5882 Temporary pool names
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Igor Kozhukhov <igor@dilos.org>
Reviewed by: John Kennedy <john.kennedy@delphix.com>
Approved by: Dan McDonald <danmcd@joyent.com>
    
      
    
          --- old/usr/src/uts/common/fs/zfs/spa.c
          +++ new/usr/src/uts/common/fs/zfs/spa.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
  25   25   * Copyright (c) 2015, Nexenta Systems, Inc.  All rights reserved.
  26   26   * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  27   27   * Copyright 2013 Saso Kiselkov. All rights reserved.
  28   28   * Copyright (c) 2014 Integros [integros.com]
  29   29   * Copyright 2016 Toomas Soome <tsoome@me.com>
  30   30   * Copyright 2018 Joyent, Inc.
  31   31   * Copyright (c) 2017 Datto Inc.
  32   32   * Copyright 2018 OmniOS Community Edition (OmniOSce) Association.
  33   33   */
  34   34  
  35   35  /*
  36   36   * SPA: Storage Pool Allocator
  37   37   *
  38   38   * This file contains all the routines used when modifying on-disk SPA state.
  39   39   * This includes opening, importing, destroying, exporting a pool, and syncing a
  40   40   * pool.
  41   41   */
  42   42  
  43   43  #include <sys/zfs_context.h>
  44   44  #include <sys/fm/fs/zfs.h>
  45   45  #include <sys/spa_impl.h>
  46   46  #include <sys/zio.h>
  47   47  #include <sys/zio_checksum.h>
  48   48  #include <sys/dmu.h>
  49   49  #include <sys/dmu_tx.h>
  50   50  #include <sys/zap.h>
  51   51  #include <sys/zil.h>
  52   52  #include <sys/ddt.h>
  53   53  #include <sys/vdev_impl.h>
  54   54  #include <sys/vdev_removal.h>
  55   55  #include <sys/vdev_indirect_mapping.h>
  56   56  #include <sys/vdev_indirect_births.h>
  57   57  #include <sys/vdev_initialize.h>
  58   58  #include <sys/metaslab.h>
  59   59  #include <sys/metaslab_impl.h>
  60   60  #include <sys/uberblock_impl.h>
  61   61  #include <sys/txg.h>
  62   62  #include <sys/avl.h>
  63   63  #include <sys/bpobj.h>
  64   64  #include <sys/dmu_traverse.h>
  65   65  #include <sys/dmu_objset.h>
  66   66  #include <sys/unique.h>
  67   67  #include <sys/dsl_pool.h>
  68   68  #include <sys/dsl_dataset.h>
  69   69  #include <sys/dsl_dir.h>
  70   70  #include <sys/dsl_prop.h>
  71   71  #include <sys/dsl_synctask.h>
  72   72  #include <sys/fs/zfs.h>
  73   73  #include <sys/arc.h>
  74   74  #include <sys/callb.h>
  75   75  #include <sys/systeminfo.h>
  76   76  #include <sys/spa_boot.h>
  77   77  #include <sys/zfs_ioctl.h>
  78   78  #include <sys/dsl_scan.h>
  79   79  #include <sys/zfeature.h>
  80   80  #include <sys/dsl_destroy.h>
  81   81  #include <sys/abd.h>
  82   82  
  83   83  #ifdef  _KERNEL
  84   84  #include <sys/bootprops.h>
  85   85  #include <sys/callb.h>
  86   86  #include <sys/cpupart.h>
  87   87  #include <sys/pool.h>
  88   88  #include <sys/sysdc.h>
  89   89  #include <sys/zone.h>
  90   90  #endif  /* _KERNEL */
  91   91  
  92   92  #include "zfs_prop.h"
  93   93  #include "zfs_comutil.h"
  94   94  
  95   95  /*
  96   96   * The interval, in seconds, at which failed configuration cache file writes
  97   97   * should be retried.
  98   98   */
  99   99  int zfs_ccw_retry_interval = 300;
 100  100  
 101  101  typedef enum zti_modes {
 102  102          ZTI_MODE_FIXED,                 /* value is # of threads (min 1) */
 103  103          ZTI_MODE_BATCH,                 /* cpu-intensive; value is ignored */
 104  104          ZTI_MODE_NULL,                  /* don't create a taskq */
 105  105          ZTI_NMODES
 106  106  } zti_modes_t;
 107  107  
 108  108  #define ZTI_P(n, q)     { ZTI_MODE_FIXED, (n), (q) }
 109  109  #define ZTI_BATCH       { ZTI_MODE_BATCH, 0, 1 }
 110  110  #define ZTI_NULL        { ZTI_MODE_NULL, 0, 0 }
 111  111  
 112  112  #define ZTI_N(n)        ZTI_P(n, 1)
 113  113  #define ZTI_ONE         ZTI_N(1)
 114  114  
 115  115  typedef struct zio_taskq_info {
 116  116          zti_modes_t zti_mode;
 117  117          uint_t zti_value;
 118  118          uint_t zti_count;
 119  119  } zio_taskq_info_t;
 120  120  
 121  121  static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 122  122          "issue", "issue_high", "intr", "intr_high"
 123  123  };
 124  124  
 125  125  /*
 126  126   * This table defines the taskq settings for each ZFS I/O type. When
 127  127   * initializing a pool, we use this table to create an appropriately sized
 128  128   * taskq. Some operations are low volume and therefore have a small, static
 129  129   * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
 130  130   * macros. Other operations process a large amount of data; the ZTI_BATCH
 131  131   * macro causes us to create a taskq oriented for throughput. Some operations
  132  132   * are so high frequency and short-lived that the taskq itself can become a
 133  133   * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
 134  134   * additional degree of parallelism specified by the number of threads per-
 135  135   * taskq and the number of taskqs; when dispatching an event in this case, the
 136  136   * particular taskq is chosen at random.
 137  137   *
 138  138   * The different taskq priorities are to handle the different contexts (issue
 139  139   * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
 140  140   * need to be handled with minimum delay.
 141  141   */
 142  142  const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 143  143          /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
 144  144          { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* NULL */
 145  145          { ZTI_N(8),     ZTI_NULL,       ZTI_P(12, 8),   ZTI_NULL }, /* READ */
 146  146          { ZTI_BATCH,    ZTI_N(5),       ZTI_N(8),       ZTI_N(5) }, /* WRITE */
 147  147          { ZTI_P(12, 8), ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* FREE */
 148  148          { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* CLAIM */
 149  149          { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL }, /* IOCTL */
 150  150  };
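            /*
             * Reader's sketch (annotation, not part of the file): expanding
             * the macros defined above, the READ row resolves to
             *
             *     ZTI_N(8)      => { ZTI_MODE_FIXED,  8, 1 }  issue
             *     ZTI_NULL      => { ZTI_MODE_NULL,   0, 0 }  issue_high
             *     ZTI_P(12, 8)  => { ZTI_MODE_FIXED, 12, 8 }  intr
             *     ZTI_NULL      => { ZTI_MODE_NULL,   0, 0 }  intr_high
             *
             * so read interrupts fan out across eight 12-thread taskqs to
             * limit contention on any single taskq.
             */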
 151  151  
 152  152  static void spa_sync_version(void *arg, dmu_tx_t *tx);
 153  153  static void spa_sync_props(void *arg, dmu_tx_t *tx);
 154  154  static boolean_t spa_has_active_shared_spare(spa_t *spa);
 155  155  static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport);
 156  156  static void spa_vdev_resilver_done(spa_t *spa);
 157  157  
 158  158  uint_t          zio_taskq_batch_pct = 75;       /* 1 thread per cpu in pset */
 159  159  id_t            zio_taskq_psrset_bind = PS_NONE;
 160  160  boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 161  161  uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 162  162  
 163  163  boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 164  164  extern int      zfs_sync_pass_deferred_free;
 165  165  
 166  166  /*
 167  167   * Report any spa_load_verify errors found, but do not fail spa_load.
 168  168   * This is used by zdb to analyze non-idle pools.
 169  169   */
 170  170  boolean_t       spa_load_verify_dryrun = B_FALSE;
 171  171  
 172  172  /*
 173  173   * This (illegal) pool name is used when temporarily importing a spa_t in order
 174  174   * to get the vdev stats associated with the imported devices.
 175  175   */
 176  176  #define TRYIMPORT_NAME  "$import"
 177  177  
 178  178  /*
 179  179   * For debugging purposes: print out vdev tree during pool import.
 180  180   */
 181  181  boolean_t       spa_load_print_vdev_tree = B_FALSE;
 182  182  
 183  183  /*
 184  184   * A non-zero value for zfs_max_missing_tvds means that we allow importing
 185  185   * pools with missing top-level vdevs. This is strictly intended for advanced
 186  186   * pool recovery cases since missing data is almost inevitable. Pools with
 187  187   * missing devices can only be imported read-only for safety reasons, and their
 188  188   * fail-mode will be automatically set to "continue".
 189  189   *
 190  190   * With 1 missing vdev we should be able to import the pool and mount all
 191  191   * datasets. User data that was not modified after the missing device has been
 192  192   * added should be recoverable. This means that snapshots created prior to the
 193  193   * addition of that device should be completely intact.
 194  194   *
 195  195   * With 2 missing vdevs, some datasets may fail to mount since there are
 196  196   * dataset statistics that are stored as regular metadata. Some data might be
 197  197   * recoverable if those vdevs were added recently.
 198  198   *
 199  199   * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
 200  200   * may be missing entirely. Chances of data recovery are very low. Note that
 201  201   * there are also risks of performing an inadvertent rewind as we might be
 202  202   * missing all the vdevs with the latest uberblocks.
 203  203   */
 204  204  uint64_t        zfs_max_missing_tvds = 0;
 205  205  
 206  206  /*
 207  207   * The parameters below are similar to zfs_max_missing_tvds but are only
 208  208   * intended for a preliminary open of the pool with an untrusted config which
 209  209   * might be incomplete or out-dated.
 210  210   *
 211  211   * We are more tolerant for pools opened from a cachefile since we could have
 212  212   * an out-dated cachefile where a device removal was not registered.
 213  213   * We could have set the limit arbitrarily high but in the case where devices
 214  214   * are really missing we would want to return the proper error codes; we chose
 215  215   * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
 216  216   * and we get a chance to retrieve the trusted config.
 217  217   */
 218  218  uint64_t        zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
 219  219  
 220  220  /*
  221  221   * In the case where config was assembled by scanning device paths (/dev/dsk
 222  222   * by default) we are less tolerant since all the existing devices should have
 223  223   * been detected and we want spa_load to return the right error codes.
 224  224   */
 225  225  uint64_t        zfs_max_missing_tvds_scan = 0;
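            /*
             * Operator sketch (annotation, assumptions noted): on a live
             * illumos system a tunable like this is typically adjusted with
             * mdb before attempting a recovery import, e.g.
             *
             *     # echo 'zfs_max_missing_tvds/Z 0t1' | mdb -kw
             *
             * The command form is an assumption about the environment and is
             * not part of this change.
             */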
 226  226  
 227  227  /*
 228  228   * Debugging aid that pauses spa_sync() towards the end.
 229  229   */
 230  230  boolean_t       zfs_pause_spa_sync = B_FALSE;
 231  231  
 232  232  /*
 233  233   * ==========================================================================
 234  234   * SPA properties routines
 235  235   * ==========================================================================
 236  236   */
 237  237  
 238  238  /*
 239  239   * Add a (source=src, propname=propval) list to an nvlist.
 240  240   */
 241  241  static void
 242  242  spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
 243  243      uint64_t intval, zprop_source_t src)
 244  244  {
 245  245          const char *propname = zpool_prop_to_name(prop);
 246  246          nvlist_t *propval;
 247  247  
 248  248          VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 249  249          VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
 250  250  
 251  251          if (strval != NULL)
 252  252                  VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
 253  253          else
 254  254                  VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
 255  255  
 256  256          VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
 257  257          nvlist_free(propval);
 258  258  }
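            /*
             * Shape sketch (annotation): given the ZPROP_SOURCE and
             * ZPROP_VALUE keys used above, a call such as
             *
             *     spa_prop_add_list(nvl, ZPOOL_PROP_CAPACITY, NULL, 42, src);
             *
             * leaves nvl holding a nested nvlist of the form
             *
             *     "capacity" = { "source" = src, "value" = 42 }
             */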
 259  259  
 260  260  /*
 261  261   * Get property values from the spa configuration.
 262  262   */
 263  263  static void
 264  264  spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 265  265  {
 266  266          vdev_t *rvd = spa->spa_root_vdev;
 267  267          dsl_pool_t *pool = spa->spa_dsl_pool;
 268  268          uint64_t size, alloc, cap, version;
 269  269          zprop_source_t src = ZPROP_SRC_NONE;
 270  270          spa_config_dirent_t *dp;
 271  271          metaslab_class_t *mc = spa_normal_class(spa);
 272  272  
 273  273          ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 274  274  
 275  275          if (rvd != NULL) {
 276  276                  alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 277  277                  size = metaslab_class_get_space(spa_normal_class(spa));
 278  278                  spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 279  279                  spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 280  280                  spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 281  281                  spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 282  282                      size - alloc, src);
 283  283                  spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL,
 284  284                      spa->spa_checkpoint_info.sci_dspace, src);
 285  285  
 286  286                  spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
 287  287                      metaslab_class_fragmentation(mc), src);
 288  288                  spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
 289  289                      metaslab_class_expandable_space(mc), src);
 290  290                  spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 291  291                      (spa_mode(spa) == FREAD), src);
 292  292  
 293  293                  cap = (size == 0) ? 0 : (alloc * 100 / size);
 294  294                  spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 295  295  
 296  296                  spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 297  297                      ddt_get_pool_dedup_ratio(spa), src);
 298  298  
 299  299                  spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 300  300                      rvd->vdev_state, src);
 301  301  
 302  302                  version = spa_version(spa);
 303  303                  if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 304  304                          src = ZPROP_SRC_DEFAULT;
 305  305                  else
 306  306                          src = ZPROP_SRC_LOCAL;
 307  307                  spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 308  308          }
 309  309  
 310  310          if (pool != NULL) {
 311  311                  /*
  312  312                   * The $FREE directory was introduced in SPA_VERSION_DEADLISTS;
  313  313                   * when opening pools before this version, freedir will be NULL.
 314  314                   */
 315  315                  if (pool->dp_free_dir != NULL) {
 316  316                          spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
 317  317                              dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes,
 318  318                              src);
 319  319                  } else {
 320  320                          spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 321  321                              NULL, 0, src);
 322  322                  }
 323  323  
 324  324                  if (pool->dp_leak_dir != NULL) {
 325  325                          spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
 326  326                              dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes,
 327  327                              src);
 328  328                  } else {
 329  329                          spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
 330  330                              NULL, 0, src);
 331  331                  }
 332  332          }
 333  333  
 334  334          spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 335  335  
 336  336          if (spa->spa_comment != NULL) {
 337  337                  spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 338  338                      0, ZPROP_SRC_LOCAL);
 339  339          }
 340  340  
 341  341          if (spa->spa_root != NULL)
 342  342                  spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 343  343                      0, ZPROP_SRC_LOCAL);
 344  344  
 345  345          if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) {
 346  346                  spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 347  347                      MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE);
 348  348          } else {
 349  349                  spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 350  350                      SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE);
 351  351          }
 352  352  
 353  353          if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 354  354                  if (dp->scd_path == NULL) {
 355  355                          spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 356  356                              "none", 0, ZPROP_SRC_LOCAL);
 357  357                  } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 358  358                          spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 359  359                              dp->scd_path, 0, ZPROP_SRC_LOCAL);
 360  360                  }
 361  361          }
 362  362  }
 363  363  
 364  364  /*
 365  365   * Get zpool property values.
 366  366   */
 367  367  int
 368  368  spa_prop_get(spa_t *spa, nvlist_t **nvp)
 369  369  {
 370  370          objset_t *mos = spa->spa_meta_objset;
 371  371          zap_cursor_t zc;
 372  372          zap_attribute_t za;
 373  373          int err;
 374  374  
 375  375          VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 376  376  
 377  377          mutex_enter(&spa->spa_props_lock);
 378  378  
 379  379          /*
 380  380           * Get properties from the spa config.
 381  381           */
 382  382          spa_prop_get_config(spa, nvp);
 383  383  
 384  384          /* If no pool property object, no more prop to get. */
 385  385          if (mos == NULL || spa->spa_pool_props_object == 0) {
 386  386                  mutex_exit(&spa->spa_props_lock);
 387  387                  return (0);
 388  388          }
 389  389  
 390  390          /*
 391  391           * Get properties from the MOS pool property object.
 392  392           */
 393  393          for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 394  394              (err = zap_cursor_retrieve(&zc, &za)) == 0;
 395  395              zap_cursor_advance(&zc)) {
 396  396                  uint64_t intval = 0;
 397  397                  char *strval = NULL;
 398  398                  zprop_source_t src = ZPROP_SRC_DEFAULT;
 399  399                  zpool_prop_t prop;
 400  400  
 401  401                  if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL)
 402  402                          continue;
 403  403  
 404  404                  switch (za.za_integer_length) {
 405  405                  case 8:
 406  406                          /* integer property */
 407  407                          if (za.za_first_integer !=
 408  408                              zpool_prop_default_numeric(prop))
 409  409                                  src = ZPROP_SRC_LOCAL;
 410  410  
 411  411                          if (prop == ZPOOL_PROP_BOOTFS) {
 412  412                                  dsl_pool_t *dp;
 413  413                                  dsl_dataset_t *ds = NULL;
 414  414  
 415  415                                  dp = spa_get_dsl(spa);
 416  416                                  dsl_pool_config_enter(dp, FTAG);
 417  417                                  err = dsl_dataset_hold_obj(dp,
 418  418                                      za.za_first_integer, FTAG, &ds);
 419  419                                  if (err != 0) {
 420  420                                          dsl_pool_config_exit(dp, FTAG);
 421  421                                          break;
 422  422                                  }
 423  423  
 424  424                                  strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
 425  425                                      KM_SLEEP);
 426  426                                  dsl_dataset_name(ds, strval);
 427  427                                  dsl_dataset_rele(ds, FTAG);
 428  428                                  dsl_pool_config_exit(dp, FTAG);
 429  429                          } else {
 430  430                                  strval = NULL;
 431  431                                  intval = za.za_first_integer;
 432  432                          }
 433  433  
 434  434                          spa_prop_add_list(*nvp, prop, strval, intval, src);
 435  435  
 436  436                          if (strval != NULL)
 437  437                                  kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
 438  438  
 439  439                          break;
 440  440  
 441  441                  case 1:
 442  442                          /* string property */
 443  443                          strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
 444  444                          err = zap_lookup(mos, spa->spa_pool_props_object,
 445  445                              za.za_name, 1, za.za_num_integers, strval);
 446  446                          if (err) {
 447  447                                  kmem_free(strval, za.za_num_integers);
 448  448                                  break;
 449  449                          }
 450  450                          spa_prop_add_list(*nvp, prop, strval, 0, src);
 451  451                          kmem_free(strval, za.za_num_integers);
 452  452                          break;
 453  453  
 454  454                  default:
 455  455                          break;
 456  456                  }
 457  457          }
 458  458          zap_cursor_fini(&zc);
 459  459          mutex_exit(&spa->spa_props_lock);
 460  460  out:
 461  461          if (err && err != ENOENT) {
 462  462                  nvlist_free(*nvp);
 463  463                  *nvp = NULL;
 464  464                  return (err);
 465  465          }
 466  466  
 467  467          return (0);
 468  468  }
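            /*
             * Caller sketch (annotation, hypothetical caller): on success the
             * routine hands back an allocated nvlist that the caller owns:
             *
             *     nvlist_t *props = NULL;
             *     if (spa_prop_get(spa, &props) == 0) {
             *             (inspect props here)
             *             nvlist_free(props);
             *     }
             */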
 469  469  
 470  470  /*
 471  471   * Validate the given pool properties nvlist and modify the list
 472  472   * for the property values to be set.
 473  473   */
 474  474  static int
 475  475  spa_prop_validate(spa_t *spa, nvlist_t *props)
 476  476  {
 477  477          nvpair_t *elem;
 478  478          int error = 0, reset_bootfs = 0;
 479  479          uint64_t objnum = 0;
 480  480          boolean_t has_feature = B_FALSE;
 481  481  
 482  482          elem = NULL;
 483  483          while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 484  484                  uint64_t intval;
 485  485                  char *strval, *slash, *check, *fname;
 486  486                  const char *propname = nvpair_name(elem);
 487  487                  zpool_prop_t prop = zpool_name_to_prop(propname);
 488  488  
 489  489                  switch (prop) {
 490  490                  case ZPOOL_PROP_INVAL:
 491  491                          if (!zpool_prop_feature(propname)) {
 492  492                                  error = SET_ERROR(EINVAL);
 493  493                                  break;
 494  494                          }
 495  495  
 496  496                          /*
 497  497                           * Sanitize the input.
 498  498                           */
 499  499                          if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 500  500                                  error = SET_ERROR(EINVAL);
 501  501                                  break;
 502  502                          }
 503  503  
 504  504                          if (nvpair_value_uint64(elem, &intval) != 0) {
 505  505                                  error = SET_ERROR(EINVAL);
 506  506                                  break;
 507  507                          }
 508  508  
 509  509                          if (intval != 0) {
 510  510                                  error = SET_ERROR(EINVAL);
 511  511                                  break;
 512  512                          }
 513  513  
 514  514                          fname = strchr(propname, '@') + 1;
 515  515                          if (zfeature_lookup_name(fname, NULL) != 0) {
 516  516                                  error = SET_ERROR(EINVAL);
 517  517                                  break;
 518  518                          }
 519  519  
 520  520                          has_feature = B_TRUE;
 521  521                          break;
 522  522  
 523  523                  case ZPOOL_PROP_VERSION:
 524  524                          error = nvpair_value_uint64(elem, &intval);
 525  525                          if (!error &&
 526  526                              (intval < spa_version(spa) ||
 527  527                              intval > SPA_VERSION_BEFORE_FEATURES ||
 528  528                              has_feature))
 529  529                                  error = SET_ERROR(EINVAL);
 530  530                          break;
 531  531  
 532  532                  case ZPOOL_PROP_DELEGATION:
 533  533                  case ZPOOL_PROP_AUTOREPLACE:
 534  534                  case ZPOOL_PROP_LISTSNAPS:
 535  535                  case ZPOOL_PROP_AUTOEXPAND:
 536  536                          error = nvpair_value_uint64(elem, &intval);
 537  537                          if (!error && intval > 1)
 538  538                                  error = SET_ERROR(EINVAL);
 539  539                          break;
 540  540  
 541  541                  case ZPOOL_PROP_BOOTFS:
 542  542                          /*
 543  543                           * If the pool version is less than SPA_VERSION_BOOTFS,
 544  544                           * or the pool is still being created (version == 0),
 545  545                           * the bootfs property cannot be set.
 546  546                           */
 547  547                          if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 548  548                                  error = SET_ERROR(ENOTSUP);
 549  549                                  break;
 550  550                          }
 551  551  
 552  552                          /*
 553  553                           * Make sure the vdev config is bootable
 554  554                           */
 555  555                          if (!vdev_is_bootable(spa->spa_root_vdev)) {
 556  556                                  error = SET_ERROR(ENOTSUP);
 557  557                                  break;
 558  558                          }
 559  559  
 560  560                          reset_bootfs = 1;
 561  561  
 562  562                          error = nvpair_value_string(elem, &strval);
 563  563  
 564  564                          if (!error) {
 565  565                                  objset_t *os;
 566  566                                  uint64_t propval;
 567  567  
 568  568                                  if (strval == NULL || strval[0] == '\0') {
 569  569                                          objnum = zpool_prop_default_numeric(
 570  570                                              ZPOOL_PROP_BOOTFS);
 571  571                                          break;
 572  572                                  }
 573  573  
 574  574                                  error = dmu_objset_hold(strval, FTAG, &os);
 575  575                                  if (error != 0)
 576  576                                          break;
 577  577  
 578  578                                  /*
 579  579                                   * Must be ZPL, and its property settings
 580  580                                   * must be supported by GRUB (compression
 581  581                                   * is not gzip, and large blocks are not used).
 582  582                                   */
 583  583  
 584  584                                  if (dmu_objset_type(os) != DMU_OST_ZFS) {
 585  585                                          error = SET_ERROR(ENOTSUP);
 586  586                                  } else if ((error =
 587  587                                      dsl_prop_get_int_ds(dmu_objset_ds(os),
 588  588                                      zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 589  589                                      &propval)) == 0 &&
 590  590                                      !BOOTFS_COMPRESS_VALID(propval)) {
 591  591                                          error = SET_ERROR(ENOTSUP);
 592  592                                  } else {
 593  593                                          objnum = dmu_objset_id(os);
 594  594                                  }
 595  595                                  dmu_objset_rele(os, FTAG);
 596  596                          }
 597  597                          break;
 598  598  
 599  599                  case ZPOOL_PROP_FAILUREMODE:
 600  600                          error = nvpair_value_uint64(elem, &intval);
 601  601                          if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
 602  602                              intval > ZIO_FAILURE_MODE_PANIC))
 603  603                                  error = SET_ERROR(EINVAL);
 604  604  
 605  605                          /*
 606  606                           * This is a special case which only occurs when
 607  607                           * the pool has completely failed. This allows
 608  608                           * the user to change the in-core failmode property
 609  609                           * without syncing it out to disk (I/Os might
 610  610                           * currently be blocked). We do this by returning
 611  611                           * EIO to the caller (spa_prop_set) to trick it
 612  612                           * into thinking we encountered a property validation
 613  613                           * error.
 614  614                           */
 615  615                          if (!error && spa_suspended(spa)) {
 616  616                                  spa->spa_failmode = intval;
 617  617                                  error = SET_ERROR(EIO);
 618  618                          }
 619  619                          break;
 620  620  
 621  621                  case ZPOOL_PROP_CACHEFILE:
 622  622                          if ((error = nvpair_value_string(elem, &strval)) != 0)
 623  623                                  break;
 624  624  
 625  625                          if (strval[0] == '\0')
 626  626                                  break;
 627  627  
 628  628                          if (strcmp(strval, "none") == 0)
 629  629                                  break;
 630  630  
 631  631                          if (strval[0] != '/') {
 632  632                                  error = SET_ERROR(EINVAL);
 633  633                                  break;
 634  634                          }
 635  635  
 636  636                          slash = strrchr(strval, '/');
 637  637                          ASSERT(slash != NULL);
 638  638  
 639  639                          if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 640  640                              strcmp(slash, "/..") == 0)
 641  641                                  error = SET_ERROR(EINVAL);
 642  642                          break;
 643  643  
 644  644                  case ZPOOL_PROP_COMMENT:
 645  645                          if ((error = nvpair_value_string(elem, &strval)) != 0)
 646  646                                  break;
 647  647                          for (check = strval; *check != '\0'; check++) {
 648  648                                  /*
 649  649                                   * The kernel doesn't have an easy isprint()
 650  650                                   * check.  For this kernel check, we merely
 651  651                                   * check ASCII apart from DEL.  Fix this if
 652  652                                   * there is an easy-to-use kernel isprint().
 653  653                                   */
 654  654                                  if (*check >= 0x7f) {
 655  655                                          error = SET_ERROR(EINVAL);
 656  656                                          break;
 657  657                                  }
 658  658                          }
 659  659                          if (strlen(strval) > ZPROP_MAX_COMMENT)
 660  660                                  error = E2BIG;
 661  661                          break;
 662  662  
 663  663                  case ZPOOL_PROP_DEDUPDITTO:
 664  664                          if (spa_version(spa) < SPA_VERSION_DEDUP)
 665  665                                  error = SET_ERROR(ENOTSUP);
 666  666                          else
 667  667                                  error = nvpair_value_uint64(elem, &intval);
 668  668                          if (error == 0 &&
 669  669                              intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
 670  670                                  error = SET_ERROR(EINVAL);
 671  671                          break;
 672  672                  }
 673  673  
 674  674                  if (error)
 675  675                          break;
 676  676          }
 677  677  
 678  678          if (!error && reset_bootfs) {
 679  679                  error = nvlist_remove(props,
 680  680                      zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 681  681  
 682  682                  if (!error) {
 683  683                          error = nvlist_add_uint64(props,
 684  684                              zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 685  685                  }
 686  686          }
 687  687  
 688  688          return (error);
 689  689  }
 690  690  
 691  691  void
 692  692  spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 693  693  {
 694  694          char *cachefile;
 695  695          spa_config_dirent_t *dp;
 696  696  
 697  697          if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
 698  698              &cachefile) != 0)
 699  699                  return;
 700  700  
 701  701          dp = kmem_alloc(sizeof (spa_config_dirent_t),
 702  702              KM_SLEEP);
 703  703  
 704  704          if (cachefile[0] == '\0')
 705  705                  dp->scd_path = spa_strdup(spa_config_path);
 706  706          else if (strcmp(cachefile, "none") == 0)
 707  707                  dp->scd_path = NULL;
 708  708          else
 709  709                  dp->scd_path = spa_strdup(cachefile);
 710  710  
 711  711          list_insert_head(&spa->spa_config_list, dp);
 712  712          if (need_sync)
 713  713                  spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 714  714  }
 715  715  
 716  716  int
 717  717  spa_prop_set(spa_t *spa, nvlist_t *nvp)
 718  718  {
 719  719          int error;
 720  720          nvpair_t *elem = NULL;
 721  721          boolean_t need_sync = B_FALSE;
 722  722  
 723  723          if ((error = spa_prop_validate(spa, nvp)) != 0)
 724  724                  return (error);
 725  725  
 726  726          while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 727  727                  zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 728  728  
 729  729                  if (prop == ZPOOL_PROP_CACHEFILE ||
 730  730                      prop == ZPOOL_PROP_ALTROOT ||
 731  731                      prop == ZPOOL_PROP_READONLY)
 732  732                          continue;
 733  733  
 734  734                  if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
 735  735                          uint64_t ver;
 736  736  
 737  737                          if (prop == ZPOOL_PROP_VERSION) {
 738  738                                  VERIFY(nvpair_value_uint64(elem, &ver) == 0);
 739  739                          } else {
 740  740                                  ASSERT(zpool_prop_feature(nvpair_name(elem)));
 741  741                                  ver = SPA_VERSION_FEATURES;
 742  742                                  need_sync = B_TRUE;
 743  743                          }
 744  744  
 745  745                          /* Save time if the version is already set. */
 746  746                          if (ver == spa_version(spa))
 747  747                                  continue;
 748  748  
 749  749                          /*
 750  750                           * In addition to the pool directory object, we might
 751  751                           * create the pool properties object, the features for
 752  752                           * read object, the features for write object, or the
 753  753                           * feature descriptions object.
 754  754                           */
 755  755                          error = dsl_sync_task(spa->spa_name, NULL,
 756  756                              spa_sync_version, &ver,
 757  757                              6, ZFS_SPACE_CHECK_RESERVED);
 758  758                          if (error)
 759  759                                  return (error);
 760  760                          continue;
 761  761                  }
 762  762  
 763  763                  need_sync = B_TRUE;
 764  764                  break;
 765  765          }
 766  766  
 767  767          if (need_sync) {
 768  768                  return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
 769  769                      nvp, 6, ZFS_SPACE_CHECK_RESERVED));
 770  770          }
 771  771  
 772  772          return (0);
 773  773  }
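            /*
             * Usage sketch (annotation, hypothetical caller): setting a
             * single property runs through the same validate-then-sync path:
             *
             *     nvlist_t *nvp;
             *     VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
             *     VERIFY(nvlist_add_string(nvp,
             *         zpool_prop_to_name(ZPOOL_PROP_COMMENT), "demo") == 0);
             *     error = spa_prop_set(spa, nvp);
             *     nvlist_free(nvp);
             */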
 774  774  
 775  775  /*
 776  776   * If the bootfs property value is dsobj, clear it.
 777  777   */
 778  778  void
 779  779  spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 780  780  {
 781  781          if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
 782  782                  VERIFY(zap_remove(spa->spa_meta_objset,
 783  783                      spa->spa_pool_props_object,
 784  784                      zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 785  785                  spa->spa_bootfs = 0;
 786  786          }
 787  787  }
 788  788  
 789  789  /*ARGSUSED*/
 790  790  static int
 791  791  spa_change_guid_check(void *arg, dmu_tx_t *tx)
 792  792  {
 793  793          uint64_t *newguid = arg;
 794  794          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 795  795          vdev_t *rvd = spa->spa_root_vdev;
 796  796          uint64_t vdev_state;
 797  797  
 798  798          if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 799  799                  int error = (spa_has_checkpoint(spa)) ?
 800  800                      ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 801  801                  return (SET_ERROR(error));
 802  802          }
 803  803  
 804  804          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 805  805          vdev_state = rvd->vdev_state;
 806  806          spa_config_exit(spa, SCL_STATE, FTAG);
 807  807  
 808  808          if (vdev_state != VDEV_STATE_HEALTHY)
 809  809                  return (SET_ERROR(ENXIO));
 810  810  
 811  811          ASSERT3U(spa_guid(spa), !=, *newguid);
 812  812  
 813  813          return (0);
 814  814  }
 815  815  
 816  816  static void
 817  817  spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 818  818  {
 819  819          uint64_t *newguid = arg;
 820  820          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 821  821          uint64_t oldguid;
 822  822          vdev_t *rvd = spa->spa_root_vdev;
 823  823  
 824  824          oldguid = spa_guid(spa);
 825  825  
 826  826          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 827  827          rvd->vdev_guid = *newguid;
 828  828          rvd->vdev_guid_sum += (*newguid - oldguid);
 829  829          vdev_config_dirty(rvd);
 830  830          spa_config_exit(spa, SCL_STATE, FTAG);
 831  831  
 832  832          spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
 833  833              oldguid, *newguid);
 834  834  }
 835  835  
 836  836  /*
 837  837   * Change the GUID for the pool.  This is done so that we can later
 838  838   * re-import a pool built from a clone of our own vdevs.  We will modify
 839  839   * the root vdev's guid, our own pool guid, and then mark all of our
 840  840   * vdevs dirty.  Note that we must make sure that all our vdevs are
 841  841   * online when we do this, or else any vdevs that weren't present
 842  842   * would be orphaned from our pool.  We are also going to issue a
 843  843   * sysevent to update any watchers.
 844  844   */
 845  845  int
 846  846  spa_change_guid(spa_t *spa)
 847  847  {
 848  848          int error;
 849  849          uint64_t guid;
 850  850  
 851  851          mutex_enter(&spa->spa_vdev_top_lock);
 852  852          mutex_enter(&spa_namespace_lock);
 853  853          guid = spa_generate_guid(NULL);
 854  854  
 855  855          error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 856  856              spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
 857  857  
 858  858          if (error == 0) {
 859  859                  spa_write_cachefile(spa, B_FALSE, B_TRUE);
 860  860                  spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
 861  861          }
 862  862  
 863  863          mutex_exit(&spa_namespace_lock);
 864  864          mutex_exit(&spa->spa_vdev_top_lock);
 865  865  
 866  866          return (error);
 867  867  }
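            /*
             * Context note (annotation): the pair above follows the usual
             * dsl_sync_task() split -- spa_change_guid_check() runs first and
             * may veto the change, then spa_change_guid_sync() applies it in
             * syncing context.  Userland reaches this path via "zpool
             * reguid"; that caller is an assumption, not shown in this file.
             */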
 868  868  
 869  869  /*
 870  870   * ==========================================================================
 871  871   * SPA state manipulation (open/create/destroy/import/export)
 872  872   * ==========================================================================
 873  873   */
 874  874  
 875  875  static int
 876  876  spa_error_entry_compare(const void *a, const void *b)
 877  877  {
 878  878          spa_error_entry_t *sa = (spa_error_entry_t *)a;
 879  879          spa_error_entry_t *sb = (spa_error_entry_t *)b;
 880  880          int ret;
 881  881  
 882  882          ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
 883  883              sizeof (zbookmark_phys_t));
 884  884  
 885  885          if (ret < 0)
 886  886                  return (-1);
 887  887          else if (ret > 0)
 888  888                  return (1);
 889  889          else
 890  890                  return (0);
 891  891  }
 892  892  
 893  893  /*
 894  894   * Utility function which retrieves copies of the current logs and
 895  895   * re-initializes them in the process.
 896  896   */
 897  897  void
 898  898  spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 899  899  {
 900  900          ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 901  901  
 902  902          bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
 903  903          bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
 904  904  
 905  905          avl_create(&spa->spa_errlist_scrub,
 906  906              spa_error_entry_compare, sizeof (spa_error_entry_t),
 907  907              offsetof(spa_error_entry_t, se_avl));
 908  908          avl_create(&spa->spa_errlist_last,
 909  909              spa_error_entry_compare, sizeof (spa_error_entry_t),
 910  910              offsetof(spa_error_entry_t, se_avl));
 911  911  }
 912  912  
 913  913  static void
 914  914  spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 915  915  {
 916  916          const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 917  917          enum zti_modes mode = ztip->zti_mode;
 918  918          uint_t value = ztip->zti_value;
 919  919          uint_t count = ztip->zti_count;
 920  920          spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 921  921          char name[32];
 922  922          uint_t flags = 0;
 923  923          boolean_t batch = B_FALSE;
 924  924  
 925  925          if (mode == ZTI_MODE_NULL) {
 926  926                  tqs->stqs_count = 0;
 927  927                  tqs->stqs_taskq = NULL;
 928  928                  return;
 929  929          }
 930  930  
 931  931          ASSERT3U(count, >, 0);
 932  932  
 933  933          tqs->stqs_count = count;
 934  934          tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 935  935  
 936  936          switch (mode) {
 937  937          case ZTI_MODE_FIXED:
 938  938                  ASSERT3U(value, >=, 1);
 939  939                  value = MAX(value, 1);
 940  940                  break;
 941  941  
 942  942          case ZTI_MODE_BATCH:
 943  943                  batch = B_TRUE;
 944  944                  flags |= TASKQ_THREADS_CPU_PCT;
 945  945                  value = zio_taskq_batch_pct;
 946  946                  break;
 947  947  
 948  948          default:
 949  949                  panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 950  950                      "spa_activate()",
 951  951                      zio_type_name[t], zio_taskq_types[q], mode, value);
 952  952                  break;
 953  953          }
 954  954  
 955  955          for (uint_t i = 0; i < count; i++) {
 956  956                  taskq_t *tq;
 957  957  
 958  958                  if (count > 1) {
 959  959                          (void) snprintf(name, sizeof (name), "%s_%s_%u",
 960  960                              zio_type_name[t], zio_taskq_types[q], i);
 961  961                  } else {
 962  962                          (void) snprintf(name, sizeof (name), "%s_%s",
 963  963                              zio_type_name[t], zio_taskq_types[q]);
 964  964                  }
 965  965  
 966  966                  if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 967  967                          if (batch)
 968  968                                  flags |= TASKQ_DC_BATCH;
 969  969  
 970  970                          tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 971  971                              spa->spa_proc, zio_taskq_basedc, flags);
 972  972                  } else {
 973  973                          pri_t pri = maxclsyspri;
 974  974                          /*
 975  975                           * The write issue taskq can be extremely CPU
 976  976                           * intensive.  Run it at slightly lower priority
 977  977                           * than the other taskqs.
 978  978                           */
 979  979                          if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
 980  980                                  pri--;
 981  981  
 982  982                          tq = taskq_create_proc(name, value, pri, 50,
 983  983                              INT_MAX, spa->spa_proc, flags);
 984  984                  }
 985  985  
 986  986                  tqs->stqs_taskq[i] = tq;
 987  987          }
 988  988  }
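            /*
             * Naming sketch (annotation): with the snprintf() formats above,
             * and assuming zio_type_name[ZIO_TYPE_READ] is "zio_read" as
             * elsewhere in ZFS, the eight READ interrupt taskqs are named
             *
             *     zio_read_intr_0 ... zio_read_intr_7
             *
             * while a single-taskq type drops the trailing index.
             */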
 989  989  
 990  990  static void
 991  991  spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 992  992  {
 993  993          spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 994  994  
 995  995          if (tqs->stqs_taskq == NULL) {
 996  996                  ASSERT0(tqs->stqs_count);
 997  997                  return;
 998  998          }
 999  999  
1000 1000          for (uint_t i = 0; i < tqs->stqs_count; i++) {
1001 1001                  ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
1002 1002                  taskq_destroy(tqs->stqs_taskq[i]);
1003 1003          }
1004 1004  
1005 1005          kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
1006 1006          tqs->stqs_taskq = NULL;
1007 1007  }
1008 1008  
1009 1009  /*
1010 1010   * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
1011 1011   * Note that a type may have multiple discrete taskqs to avoid lock contention
1012 1012   * on the taskq itself. In that case we choose which taskq at random by using
1013 1013   * the low bits of gethrtime().
1014 1014   */
1015 1015  void
1016 1016  spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
1017 1017      task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
1018 1018  {
1019 1019          spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1020 1020          taskq_t *tq;
1021 1021  
1022 1022          ASSERT3P(tqs->stqs_taskq, !=, NULL);
1023 1023          ASSERT3U(tqs->stqs_count, !=, 0);
1024 1024  
1025 1025          if (tqs->stqs_count == 1) {
1026 1026                  tq = tqs->stqs_taskq[0];
1027 1027          } else {
1028 1028                  tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
1029 1029          }
1030 1030  
1031 1031          taskq_dispatch_ent(tq, func, arg, flags, ent);
1032 1032  }
1033 1033  
1034 1034  static void
1035 1035  spa_create_zio_taskqs(spa_t *spa)
1036 1036  {
1037 1037          for (int t = 0; t < ZIO_TYPES; t++) {
1038 1038                  for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1039 1039                          spa_taskqs_init(spa, t, q);
1040 1040                  }
1041 1041          }
1042 1042  }
1043 1043  
1044 1044  #ifdef _KERNEL
1045 1045  static void
1046 1046  spa_thread(void *arg)
1047 1047  {
1048 1048          callb_cpr_t cprinfo;
1049 1049  
1050 1050          spa_t *spa = arg;
1051 1051          user_t *pu = PTOU(curproc);
1052 1052  
1053 1053          CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
1054 1054              spa->spa_name);
1055 1055  
1056 1056          ASSERT(curproc != &p0);
1057 1057          (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
1058 1058              "zpool-%s", spa->spa_name);
1059 1059          (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
1060 1060  
1061 1061          /* bind this thread to the requested psrset */
1062 1062          if (zio_taskq_psrset_bind != PS_NONE) {
1063 1063                  pool_lock();
1064 1064                  mutex_enter(&cpu_lock);
1065 1065                  mutex_enter(&pidlock);
1066 1066                  mutex_enter(&curproc->p_lock);
1067 1067  
1068 1068                  if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
1069 1069                      0, NULL, NULL) == 0)  {
1070 1070                          curthread->t_bind_pset = zio_taskq_psrset_bind;
1071 1071                  } else {
1072 1072                          cmn_err(CE_WARN,
1073 1073                              "Couldn't bind process for zfs pool \"%s\" to "
1074 1074                              "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
1075 1075                  }
1076 1076  
1077 1077                  mutex_exit(&curproc->p_lock);
1078 1078                  mutex_exit(&pidlock);
1079 1079                  mutex_exit(&cpu_lock);
1080 1080                  pool_unlock();
1081 1081          }
1082 1082  
1083 1083          if (zio_taskq_sysdc) {
1084 1084                  sysdc_thread_enter(curthread, 100, 0);
1085 1085          }
1086 1086  
1087 1087          spa->spa_proc = curproc;
1088 1088          spa->spa_did = curthread->t_did;
1089 1089  
1090 1090          spa_create_zio_taskqs(spa);
1091 1091  
1092 1092          mutex_enter(&spa->spa_proc_lock);
1093 1093          ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
1094 1094  
1095 1095          spa->spa_proc_state = SPA_PROC_ACTIVE;
1096 1096          cv_broadcast(&spa->spa_proc_cv);
1097 1097  
1098 1098          CALLB_CPR_SAFE_BEGIN(&cprinfo);
1099 1099          while (spa->spa_proc_state == SPA_PROC_ACTIVE)
1100 1100                  cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1101 1101          CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
1102 1102  
1103 1103          ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
1104 1104          spa->spa_proc_state = SPA_PROC_GONE;
1105 1105          spa->spa_proc = &p0;
1106 1106          cv_broadcast(&spa->spa_proc_cv);
1107 1107          CALLB_CPR_EXIT(&cprinfo);       /* drops spa_proc_lock */
1108 1108  
1109 1109          mutex_enter(&curproc->p_lock);
1110 1110          lwp_exit();
1111 1111  }
1112 1112  #endif
1113 1113  
1114 1114  /*
1115 1115   * Activate an uninitialized pool.
1116 1116   */
1117 1117  static void
1118 1118  spa_activate(spa_t *spa, int mode)
1119 1119  {
1120 1120          ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
1121 1121  
1122 1122          spa->spa_state = POOL_STATE_ACTIVE;
1123 1123          spa->spa_mode = mode;
1124 1124  
1125 1125          spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
1126 1126          spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
1127 1127  
1128 1128          /* Try to create a covering process */
1129 1129          mutex_enter(&spa->spa_proc_lock);
1130 1130          ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
1131 1131          ASSERT(spa->spa_proc == &p0);
1132 1132          spa->spa_did = 0;
1133 1133  
1134 1134          /* Only create a process if we're going to be around a while. */
1135 1135          if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
1136 1136                  if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
1137 1137                      NULL, 0) == 0) {
1138 1138                          spa->spa_proc_state = SPA_PROC_CREATED;
1139 1139                          while (spa->spa_proc_state == SPA_PROC_CREATED) {
1140 1140                                  cv_wait(&spa->spa_proc_cv,
1141 1141                                      &spa->spa_proc_lock);
1142 1142                          }
1143 1143                          ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1144 1144                          ASSERT(spa->spa_proc != &p0);
1145 1145                          ASSERT(spa->spa_did != 0);
1146 1146                  } else {
1147 1147  #ifdef _KERNEL
1148 1148                          cmn_err(CE_WARN,
1149 1149                              "Couldn't create process for zfs pool \"%s\"\n",
1150 1150                              spa->spa_name);
1151 1151  #endif
1152 1152                  }
1153 1153          }
1154 1154          mutex_exit(&spa->spa_proc_lock);
1155 1155  
1156 1156          /* If we didn't create a process, we need to create our taskqs. */
1157 1157          if (spa->spa_proc == &p0) {
1158 1158                  spa_create_zio_taskqs(spa);
1159 1159          }
1160 1160  
1161 1161          for (size_t i = 0; i < TXG_SIZE; i++) {
1162 1162                  spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
1163 1163                      ZIO_FLAG_CANFAIL);
1164 1164          }
1165 1165  
1166 1166          list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
1167 1167              offsetof(vdev_t, vdev_config_dirty_node));
1168 1168          list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
1169 1169              offsetof(objset_t, os_evicting_node));
1170 1170          list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
1171 1171              offsetof(vdev_t, vdev_state_dirty_node));
1172 1172  
1173 1173          txg_list_create(&spa->spa_vdev_txg_list, spa,
1174 1174              offsetof(struct vdev, vdev_txg_node));
1175 1175  
1176 1176          avl_create(&spa->spa_errlist_scrub,
1177 1177              spa_error_entry_compare, sizeof (spa_error_entry_t),
1178 1178              offsetof(spa_error_entry_t, se_avl));
1179 1179          avl_create(&spa->spa_errlist_last,
1180 1180              spa_error_entry_compare, sizeof (spa_error_entry_t),
1181 1181              offsetof(spa_error_entry_t, se_avl));
1182 1182  }
1183 1183  
1184 1184  /*
1185 1185   * Opposite of spa_activate().
1186 1186   */
1187 1187  static void
1188 1188  spa_deactivate(spa_t *spa)
1189 1189  {
1190 1190          ASSERT(spa->spa_sync_on == B_FALSE);
1191 1191          ASSERT(spa->spa_dsl_pool == NULL);
1192 1192          ASSERT(spa->spa_root_vdev == NULL);
1193 1193          ASSERT(spa->spa_async_zio_root == NULL);
1194 1194          ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
1195 1195  
1196 1196          spa_evicting_os_wait(spa);
1197 1197  
1198 1198          txg_list_destroy(&spa->spa_vdev_txg_list);
1199 1199  
1200 1200          list_destroy(&spa->spa_config_dirty_list);
1201 1201          list_destroy(&spa->spa_evicting_os_list);
1202 1202          list_destroy(&spa->spa_state_dirty_list);
1203 1203  
1204 1204          for (int t = 0; t < ZIO_TYPES; t++) {
1205 1205                  for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
1206 1206                          spa_taskqs_fini(spa, t, q);
1207 1207                  }
1208 1208          }
1209 1209  
1210 1210          for (size_t i = 0; i < TXG_SIZE; i++) {
1211 1211                  ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
1212 1212                  VERIFY0(zio_wait(spa->spa_txg_zio[i]));
1213 1213                  spa->spa_txg_zio[i] = NULL;
1214 1214          }
1215 1215  
1216 1216          metaslab_class_destroy(spa->spa_normal_class);
1217 1217          spa->spa_normal_class = NULL;
1218 1218  
1219 1219          metaslab_class_destroy(spa->spa_log_class);
1220 1220          spa->spa_log_class = NULL;
1221 1221  
1222 1222          /*
1223 1223           * If this was part of an import or the open otherwise failed, we may
1224 1224           * still have errors left in the queues.  Empty them just in case.
1225 1225           */
1226 1226          spa_errlog_drain(spa);
1227 1227  
1228 1228          avl_destroy(&spa->spa_errlist_scrub);
1229 1229          avl_destroy(&spa->spa_errlist_last);
1230 1230  
1231 1231          spa->spa_state = POOL_STATE_UNINITIALIZED;
1232 1232  
1233 1233          mutex_enter(&spa->spa_proc_lock);
1234 1234          if (spa->spa_proc_state != SPA_PROC_NONE) {
1235 1235                  ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
1236 1236                  spa->spa_proc_state = SPA_PROC_DEACTIVATE;
1237 1237                  cv_broadcast(&spa->spa_proc_cv);
1238 1238                  while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
1239 1239                          ASSERT(spa->spa_proc != &p0);
1240 1240                          cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
1241 1241                  }
1242 1242                  ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
1243 1243                  spa->spa_proc_state = SPA_PROC_NONE;
1244 1244          }
1245 1245          ASSERT(spa->spa_proc == &p0);
1246 1246          mutex_exit(&spa->spa_proc_lock);
1247 1247  
1248 1248          /*
1249 1249           * We want to make sure spa_thread() has actually exited the ZFS
1250 1250           * module, so that the module can't be unloaded out from underneath
1251 1251           * it.
1252 1252           */
1253 1253          if (spa->spa_did != 0) {
1254 1254                  thread_join(spa->spa_did);
1255 1255                  spa->spa_did = 0;
1256 1256          }
1257 1257  }
1258 1258  
1259 1259  /*
1260 1260   * Verify a pool configuration, and construct the vdev tree appropriately.  This
1261 1261   * will create all the necessary vdevs in the appropriate layout, with each vdev
1262 1262   * in the CLOSED state.  This will prep the pool before open/creation/import.
1263 1263   * All vdev validation is done by the vdev_alloc() routine.
1264 1264   */
1265 1265  static int
1266 1266  spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
1267 1267      uint_t id, int atype)
1268 1268  {
1269 1269          nvlist_t **child;
1270 1270          uint_t children;
1271 1271          int error;
1272 1272  
1273 1273          if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
1274 1274                  return (error);
1275 1275  
1276 1276          if ((*vdp)->vdev_ops->vdev_op_leaf)
1277 1277                  return (0);
1278 1278  
1279 1279          error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1280 1280              &child, &children);
1281 1281  
1282 1282          if (error == ENOENT)
1283 1283                  return (0);
1284 1284  
1285 1285          if (error) {
1286 1286                  vdev_free(*vdp);
1287 1287                  *vdp = NULL;
1288 1288                  return (SET_ERROR(EINVAL));
1289 1289          }
1290 1290  
1291 1291          for (int c = 0; c < children; c++) {
1292 1292                  vdev_t *vd;
1293 1293                  if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
1294 1294                      atype)) != 0) {
1295 1295                          vdev_free(*vdp);
1296 1296                          *vdp = NULL;
1297 1297                          return (error);
1298 1298                  }
1299 1299          }
1300 1300  
1301 1301          ASSERT(*vdp != NULL);
1302 1302  
1303 1303          return (0);
1304 1304  }
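
For reference, spa_config_parse() consumes a nested nvlist of roughly the
shape sketched below. This is a hypothetical userland construction using
libnvpair, with error handling elided; real configs carry many more pairs
per vdev (GUIDs, ashift, and so on), and the device paths are made up:

    #include <libnvpair.h>
    #include <sys/fs/zfs.h>

    static nvlist_t *
    make_disk(const char *path)
    {
            nvlist_t *nv;

            (void) nvlist_alloc(&nv, NV_UNIQUE_NAME, 0);
            (void) nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK);
            (void) nvlist_add_string(nv, ZPOOL_CONFIG_PATH, path);
            return (nv);
    }

    int
    main(void)
    {
            nvlist_t *root, *mirror, *kids[2];

            kids[0] = make_disk("/dev/dsk/c0t0d0s0");
            kids[1] = make_disk("/dev/dsk/c0t1d0s0");

            (void) nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0);
            (void) nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
                VDEV_TYPE_MIRROR);
            (void) nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
                kids, 2);

            (void) nvlist_alloc(&root, NV_UNIQUE_NAME, 0);
            (void) nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT);
            (void) nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
                &mirror, 1);

            /* spa_config_parse() would recurse root -> mirror -> two leaves */
            nvlist_free(kids[0]);
            nvlist_free(kids[1]);
            nvlist_free(mirror);
            nvlist_free(root);
            return (0);
    }
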
1305 1305  
1306 1306  /*
1307 1307   * Opposite of spa_load().
1308 1308   */
1309 1309  static void
1310 1310  spa_unload(spa_t *spa)
1311 1311  {
1312 1312          int i;
1313 1313  
1314 1314          ASSERT(MUTEX_HELD(&spa_namespace_lock));
1315 1315  
1316 1316          spa_load_note(spa, "UNLOADING");
1317 1317  
1318 1318          /*
1319 1319           * Stop async tasks.
1320 1320           */
1321 1321          spa_async_suspend(spa);
1322 1322  
1323 1323          if (spa->spa_root_vdev) {
1324 1324                  vdev_initialize_stop_all(spa->spa_root_vdev,
1325 1325                      VDEV_INITIALIZE_ACTIVE);
1326 1326          }
1327 1327  
1328 1328          /*
1329 1329           * Stop syncing.
1330 1330           */
1331 1331          if (spa->spa_sync_on) {
1332 1332                  txg_sync_stop(spa->spa_dsl_pool);
1333 1333                  spa->spa_sync_on = B_FALSE;
1334 1334          }
1335 1335  
1336 1336          /*
1337 1337           * Even though vdev_free() also calls vdev_metaslab_fini, we need
1338 1338           * to call it earlier, before we wait for async i/o to complete.
1339 1339           * This ensures that there is no async metaslab prefetching still
1340 1340           * in flight, since that path does a taskq_wait(mg_taskq).
1341 1341           */
1342 1342          if (spa->spa_root_vdev != NULL) {
1343 1343                  spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1344 1344                  for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++)
1345 1345                          vdev_metaslab_fini(spa->spa_root_vdev->vdev_child[c]);
1346 1346                  spa_config_exit(spa, SCL_ALL, spa);
1347 1347          }
1348 1348  
1349 1349          /*
1350 1350           * Wait for any outstanding async I/O to complete.
1351 1351           */
1352 1352          if (spa->spa_async_zio_root != NULL) {
1353 1353                  for (int i = 0; i < max_ncpus; i++)
1354 1354                          (void) zio_wait(spa->spa_async_zio_root[i]);
1355 1355                  kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
1356 1356                  spa->spa_async_zio_root = NULL;
1357 1357          }
1358 1358  
1359 1359          if (spa->spa_vdev_removal != NULL) {
1360 1360                  spa_vdev_removal_destroy(spa->spa_vdev_removal);
1361 1361                  spa->spa_vdev_removal = NULL;
1362 1362          }
1363 1363  
1364 1364          if (spa->spa_condense_zthr != NULL) {
1365 1365                  ASSERT(!zthr_isrunning(spa->spa_condense_zthr));
1366 1366                  zthr_destroy(spa->spa_condense_zthr);
1367 1367                  spa->spa_condense_zthr = NULL;
1368 1368          }
1369 1369  
1370 1370          if (spa->spa_checkpoint_discard_zthr != NULL) {
1371 1371                  ASSERT(!zthr_isrunning(spa->spa_checkpoint_discard_zthr));
1372 1372                  zthr_destroy(spa->spa_checkpoint_discard_zthr);
1373 1373                  spa->spa_checkpoint_discard_zthr = NULL;
1374 1374          }
1375 1375  
1376 1376          spa_condense_fini(spa);
1377 1377  
1378 1378          bpobj_close(&spa->spa_deferred_bpobj);
1379 1379  
1380 1380          spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
1381 1381  
1382 1382          /*
1383 1383           * Close all vdevs.
1384 1384           */
1385 1385          if (spa->spa_root_vdev)
1386 1386                  vdev_free(spa->spa_root_vdev);
1387 1387          ASSERT(spa->spa_root_vdev == NULL);
1388 1388  
1389 1389          /*
1390 1390           * Close the dsl pool.
1391 1391           */
1392 1392          if (spa->spa_dsl_pool) {
1393 1393                  dsl_pool_close(spa->spa_dsl_pool);
1394 1394                  spa->spa_dsl_pool = NULL;
1395 1395                  spa->spa_meta_objset = NULL;
1396 1396          }
1397 1397  
1398 1398          ddt_unload(spa);
1399 1399  
1400 1400          /*
1401 1401           * Drop and purge level 2 cache
1402 1402           */
1403 1403          spa_l2cache_drop(spa);
1404 1404  
1405 1405          for (i = 0; i < spa->spa_spares.sav_count; i++)
1406 1406                  vdev_free(spa->spa_spares.sav_vdevs[i]);
1407 1407          if (spa->spa_spares.sav_vdevs) {
1408 1408                  kmem_free(spa->spa_spares.sav_vdevs,
1409 1409                      spa->spa_spares.sav_count * sizeof (void *));
1410 1410                  spa->spa_spares.sav_vdevs = NULL;
1411 1411          }
1412 1412          if (spa->spa_spares.sav_config) {
1413 1413                  nvlist_free(spa->spa_spares.sav_config);
1414 1414                  spa->spa_spares.sav_config = NULL;
1415 1415          }
1416 1416          spa->spa_spares.sav_count = 0;
1417 1417  
1418 1418          for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1419 1419                  vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1420 1420                  vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1421 1421          }
1422 1422          if (spa->spa_l2cache.sav_vdevs) {
1423 1423                  kmem_free(spa->spa_l2cache.sav_vdevs,
1424 1424                      spa->spa_l2cache.sav_count * sizeof (void *));
1425 1425                  spa->spa_l2cache.sav_vdevs = NULL;
1426 1426          }
1427 1427          if (spa->spa_l2cache.sav_config) {
1428 1428                  nvlist_free(spa->spa_l2cache.sav_config);
1429 1429                  spa->spa_l2cache.sav_config = NULL;
1430 1430          }
1431 1431          spa->spa_l2cache.sav_count = 0;
1432 1432  
1433 1433          spa->spa_async_suspended = 0;
1434 1434  
1435 1435          spa->spa_indirect_vdevs_loaded = B_FALSE;
1436 1436  
1437 1437          if (spa->spa_comment != NULL) {
1438 1438                  spa_strfree(spa->spa_comment);
1439 1439                  spa->spa_comment = NULL;
1440 1440          }
1441 1441  
1442 1442          spa_config_exit(spa, SCL_ALL, spa);
1443 1443  }
1444 1444  
1445 1445  /*
1446 1446   * Load (or re-load) the current list of vdevs describing the active spares for
1447 1447   * this pool.  When this is called, we have some form of basic information in
1448 1448   * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
1449 1449   * then re-generate a more complete list including status information.
1450 1450   */
1451 1451  void
1452 1452  spa_load_spares(spa_t *spa)
1453 1453  {
1454 1454          nvlist_t **spares;
1455 1455          uint_t nspares;
1456 1456          int i;
1457 1457          vdev_t *vd, *tvd;
1458 1458  
1459 1459  #ifndef _KERNEL
1460 1460          /*
1461 1461           * zdb opens both the current state of the pool and the
1462 1462           * checkpointed state (if present), with a different spa_t.
1463 1463           *
1464 1464           * As spare vdevs are shared among open pools, we skip loading
1465 1465           * them when we load the checkpointed state of the pool.
1466 1466           */
1467 1467          if (!spa_writeable(spa))
1468 1468                  return;
1469 1469  #endif
1470 1470  
1471 1471          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1472 1472  
1473 1473          /*
1474 1474           * First, close and free any existing spare vdevs.
1475 1475           */
1476 1476          for (i = 0; i < spa->spa_spares.sav_count; i++) {
1477 1477                  vd = spa->spa_spares.sav_vdevs[i];
1478 1478  
1479 1479                  /* Undo the call to spa_spare_activate() below */
1480 1480                  if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1481 1481                      B_FALSE)) != NULL && tvd->vdev_isspare)
1482 1482                          spa_spare_remove(tvd);
1483 1483                  vdev_close(vd);
1484 1484                  vdev_free(vd);
1485 1485          }
1486 1486  
1487 1487          if (spa->spa_spares.sav_vdevs)
1488 1488                  kmem_free(spa->spa_spares.sav_vdevs,
1489 1489                      spa->spa_spares.sav_count * sizeof (void *));
1490 1490  
1491 1491          if (spa->spa_spares.sav_config == NULL)
1492 1492                  nspares = 0;
1493 1493          else
1494 1494                  VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1495 1495                      ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1496 1496  
1497 1497          spa->spa_spares.sav_count = (int)nspares;
1498 1498          spa->spa_spares.sav_vdevs = NULL;
1499 1499  
1500 1500          if (nspares == 0)
1501 1501                  return;
1502 1502  
1503 1503          /*
1504 1504           * Construct the array of vdevs, opening them to get status in the
1505 1505           * process.  For each spare, there are potentially two different vdev_t
1506 1506           * structures associated with it: one in the list of spares (used only
1507 1507           * for basic validation purposes) and one in the active vdev
1508 1508           * configuration (if it's spared in).  During this phase we open and
1509 1509           * validate each vdev on the spare list.  If the vdev also exists in the
1510 1510           * active configuration, then we also mark this vdev as an active spare.
1511 1511           */
1512 1512          spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1513 1513              KM_SLEEP);
1514 1514          for (i = 0; i < spa->spa_spares.sav_count; i++) {
1515 1515                  VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1516 1516                      VDEV_ALLOC_SPARE) == 0);
1517 1517                  ASSERT(vd != NULL);
1518 1518  
1519 1519                  spa->spa_spares.sav_vdevs[i] = vd;
1520 1520  
1521 1521                  if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1522 1522                      B_FALSE)) != NULL) {
1523 1523                          if (!tvd->vdev_isspare)
1524 1524                                  spa_spare_add(tvd);
1525 1525  
1526 1526                          /*
1527 1527                           * We only mark the spare active if we were successfully
1528 1528                           * able to load the vdev.  Otherwise, importing a pool
1529 1529                           * with a bad active spare would result in strange
1530 1530                           * behavior, because multiple pools would think the spare
1531 1531                           * is actively in use.
1532 1532                           *
1533 1533                           * There is a vulnerability here to an equally bizarre
1534 1534                           * circumstance, where a dead active spare is later
1535 1535                           * brought back to life (onlined or otherwise).  Given
1536 1536                           * the rarity of this scenario, and the extra complexity
1537 1537                           * it adds, we ignore the possibility.
1538 1538                           */
1539 1539                          if (!vdev_is_dead(tvd))
1540 1540                                  spa_spare_activate(tvd);
1541 1541                  }
1542 1542  
1543 1543                  vd->vdev_top = vd;
1544 1544                  vd->vdev_aux = &spa->spa_spares;
1545 1545  
1546 1546                  if (vdev_open(vd) != 0)
1547 1547                          continue;
1548 1548  
1549 1549                  if (vdev_validate_aux(vd) == 0)
1550 1550                          spa_spare_add(vd);
1551 1551          }
1552 1552  
1553 1553          /*
1554 1554           * Recompute the stashed list of spares, with status information
1555 1555           * this time.
1556 1556           */
1557 1557          VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1558 1558              DATA_TYPE_NVLIST_ARRAY) == 0);
1559 1559  
1560 1560          spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1561 1561              KM_SLEEP);
1562 1562          for (i = 0; i < spa->spa_spares.sav_count; i++)
1563 1563                  spares[i] = vdev_config_generate(spa,
1564 1564                      spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1565 1565          VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1566 1566              ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1567 1567          for (i = 0; i < spa->spa_spares.sav_count; i++)
1568 1568                  nvlist_free(spares[i]);
1569 1569          kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1570 1570  }
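
The final block above uses a common libnvpair idiom: drop the stale array
from the config, then stash the freshly generated one under the same key.
A compact sketch of that idiom, with illustrative names:

    #include <libnvpair.h>

    /* Replace an nvlist array in place: remove the old copy, add the new. */
    static void
    replace_array(nvlist_t *cfg, const char *name, nvlist_t **elems, uint_t n)
    {
            (void) nvlist_remove(cfg, name, DATA_TYPE_NVLIST_ARRAY);
            (void) nvlist_add_nvlist_array(cfg, name, elems, n);
    }
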
1571 1571  
1572 1572  /*
1573 1573   * Load (or re-load) the current list of vdevs describing the active l2cache for
1574 1574   * this pool.  When this is called, we have some form of basic information in
1575 1575   * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
1576 1576   * then re-generate a more complete list including status information.
1577 1577   * Devices which are already active have their details maintained, and are
1578 1578   * not re-opened.
1579 1579   */
1580 1580  void
1581 1581  spa_load_l2cache(spa_t *spa)
1582 1582  {
1583 1583          nvlist_t **l2cache;
1584 1584          uint_t nl2cache;
1585 1585          int i, j, oldnvdevs;
1586 1586          uint64_t guid;
1587 1587          vdev_t *vd, **oldvdevs, **newvdevs;
1588 1588          spa_aux_vdev_t *sav = &spa->spa_l2cache;
1589 1589  
1590 1590  #ifndef _KERNEL
1591 1591          /*
1592 1592           * zdb opens both the current state of the pool and the
1593 1593           * checkpointed state (if present), with a different spa_t.
1594 1594           *
1595 1595           * As L2 caches are part of the ARC which is shared among open
1596 1596           * pools, we skip loading them when we load the checkpointed
1597 1597           * state of the pool.
1598 1598           */
1599 1599          if (!spa_writeable(spa))
1600 1600                  return;
1601 1601  #endif
1602 1602  
1603 1603          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1604 1604  
1605 1605          if (sav->sav_config != NULL) {
1606 1606                  VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1607 1607                      ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1608 1608                  newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1609 1609          } else {
1610 1610                  nl2cache = 0;
1611 1611                  newvdevs = NULL;
1612 1612          }
1613 1613  
1614 1614          oldvdevs = sav->sav_vdevs;
1615 1615          oldnvdevs = sav->sav_count;
1616 1616          sav->sav_vdevs = NULL;
1617 1617          sav->sav_count = 0;
1618 1618  
1619 1619          /*
1620 1620           * Process new nvlist of vdevs.
1621 1621           */
1622 1622          for (i = 0; i < nl2cache; i++) {
1623 1623                  VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1624 1624                      &guid) == 0);
1625 1625  
1626 1626                  newvdevs[i] = NULL;
1627 1627                  for (j = 0; j < oldnvdevs; j++) {
1628 1628                          vd = oldvdevs[j];
1629 1629                          if (vd != NULL && guid == vd->vdev_guid) {
1630 1630                                  /*
1631 1631                                   * Retain previous vdev for add/remove ops.
1632 1632                                   */
1633 1633                                  newvdevs[i] = vd;
1634 1634                                  oldvdevs[j] = NULL;
1635 1635                                  break;
1636 1636                          }
1637 1637                  }
1638 1638  
1639 1639                  if (newvdevs[i] == NULL) {
1640 1640                          /*
1641 1641                           * Create new vdev
1642 1642                           */
1643 1643                          VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1644 1644                              VDEV_ALLOC_L2CACHE) == 0);
1645 1645                          ASSERT(vd != NULL);
1646 1646                          newvdevs[i] = vd;
1647 1647  
1648 1648                          /*
1649 1649                           * Commit this vdev as an l2cache device,
1650 1650                           * even if it fails to open.
1651 1651                           */
1652 1652                          spa_l2cache_add(vd);
1653 1653  
1654 1654                          vd->vdev_top = vd;
1655 1655                          vd->vdev_aux = sav;
1656 1656  
1657 1657                          spa_l2cache_activate(vd);
1658 1658  
1659 1659                          if (vdev_open(vd) != 0)
1660 1660                                  continue;
1661 1661  
1662 1662                          (void) vdev_validate_aux(vd);
1663 1663  
1664 1664                          if (!vdev_is_dead(vd))
1665 1665                                  l2arc_add_vdev(spa, vd);
1666 1666                  }
1667 1667          }
1668 1668  
1669 1669          /*
1670 1670           * Purge vdevs that were dropped
1671 1671           */
1672 1672          for (i = 0; i < oldnvdevs; i++) {
1673 1673                  uint64_t pool;
1674 1674  
1675 1675                  vd = oldvdevs[i];
1676 1676                  if (vd != NULL) {
1677 1677                          ASSERT(vd->vdev_isl2cache);
1678 1678  
1679 1679                          if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1680 1680                              pool != 0ULL && l2arc_vdev_present(vd))
1681 1681                                  l2arc_remove_vdev(vd);
1682 1682                          vdev_clear_stats(vd);
1683 1683                          vdev_free(vd);
1684 1684                  }
1685 1685          }
1686 1686  
1687 1687          if (oldvdevs)
1688 1688                  kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1689 1689  
1690 1690          if (sav->sav_config == NULL)
1691 1691                  goto out;
1692 1692  
1693 1693          sav->sav_vdevs = newvdevs;
1694 1694          sav->sav_count = (int)nl2cache;
1695 1695  
1696 1696          /*
1697 1697           * Recompute the stashed list of l2cache devices, with status
1698 1698           * information this time.
1699 1699           */
1700 1700          VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1701 1701              DATA_TYPE_NVLIST_ARRAY) == 0);
1702 1702  
1703 1703          l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1704 1704          for (i = 0; i < sav->sav_count; i++)
1705 1705                  l2cache[i] = vdev_config_generate(spa,
1706 1706                      sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1707 1707          VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1708 1708              ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1709 1709  out:
1710 1710          for (i = 0; i < sav->sav_count; i++)
1711 1711                  nvlist_free(l2cache[i]);
1712 1712          if (sav->sav_count)
1713 1713                  kmem_free(l2cache, sav->sav_count * sizeof (void *));
1714 1714  }
1715 1715  
1716 1716  static int
1717 1717  load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1718 1718  {
1719 1719          dmu_buf_t *db;
1720 1720          char *packed = NULL;
1721 1721          size_t nvsize = 0;
1722 1722          int error;
1723 1723          *value = NULL;
1724 1724  
1725 1725          error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db);
1726 1726          if (error != 0)
1727 1727                  return (error);
1728 1728  
1729 1729          nvsize = *(uint64_t *)db->db_data;
1730 1730          dmu_buf_rele(db, FTAG);
1731 1731  
1732 1732          packed = kmem_alloc(nvsize, KM_SLEEP);
1733 1733          error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1734 1734              DMU_READ_PREFETCH);
1735 1735          if (error == 0)
1736 1736                  error = nvlist_unpack(packed, nvsize, value, 0);
1737 1737          kmem_free(packed, nvsize);
1738 1738  
1739 1739          return (error);
1740 1740  }
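
load_nvlist() is the read half of a pack/unpack round trip: per the code
above, the object holds the nvlist_pack() byte stream and the bonus buffer
holds that stream's length. A userland sketch of the same round trip
(assumed layout; error handling elided):

    #include <stdlib.h>
    #include <libnvpair.h>

    int
    main(void)
    {
            nvlist_t *nv, *copy;
            char *packed = NULL;
            size_t size = 0;

            (void) nvlist_alloc(&nv, NV_UNIQUE_NAME, 0);
            (void) nvlist_add_uint64(nv, "example", 42);

            /* Encode: libnvpair allocates the buffer and returns its size. */
            (void) nvlist_pack(nv, &packed, &size, NV_ENCODE_XDR, 0);

            /* Decode, as load_nvlist() does after dmu_read(). */
            (void) nvlist_unpack(packed, size, &copy, 0);

            free(packed);
            nvlist_free(nv);
            nvlist_free(copy);
            return (0);
    }
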
1741 1741  
1742 1742  /*
1743 1743   * Concrete top-level vdevs that are not missing and are not logs. At every
1744 1744   * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
1745 1745   */
1746 1746  static uint64_t
1747 1747  spa_healthy_core_tvds(spa_t *spa)
1748 1748  {
1749 1749          vdev_t *rvd = spa->spa_root_vdev;
1750 1750          uint64_t tvds = 0;
1751 1751  
1752 1752          for (uint64_t i = 0; i < rvd->vdev_children; i++) {
1753 1753                  vdev_t *vd = rvd->vdev_child[i];
1754 1754                  if (vd->vdev_islog)
1755 1755                          continue;
1756 1756                  if (vdev_is_concrete(vd) && !vdev_is_dead(vd))
1757 1757                          tvds++;
1758 1758          }
1759 1759  
1760 1760          return (tvds);
1761 1761  }
1762 1762  
1763 1763  /*
1764 1764   * Check whether the given vdev could not be opened; if so, we post a
1765 1765   * sysevent to notify the autoreplace code that the device has been removed.
1766 1766   */
1767 1767  static void
1768 1768  spa_check_removed(vdev_t *vd)
1769 1769  {
1770 1770          for (uint64_t c = 0; c < vd->vdev_children; c++)
1771 1771                  spa_check_removed(vd->vdev_child[c]);
1772 1772  
1773 1773          if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) &&
1774 1774              vdev_is_concrete(vd)) {
1775 1775                  zfs_post_autoreplace(vd->vdev_spa, vd);
1776 1776                  spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
1777 1777          }
1778 1778  }
1779 1779  
1780 1780  static int
1781 1781  spa_check_for_missing_logs(spa_t *spa)
1782 1782  {
1783 1783          vdev_t *rvd = spa->spa_root_vdev;
1784 1784  
1785 1785          /*
1786 1786           * If we're doing a normal import, then build up any additional
1787 1787           * diagnostic information about missing log devices.
1788 1788           * We'll pass this up to the user for further processing.
1789 1789           */
1790 1790          if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1791 1791                  nvlist_t **child, *nv;
1792 1792                  uint64_t idx = 0;
1793 1793  
1794 1794                  child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1795 1795                      KM_SLEEP);
1796 1796                  VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1797 1797  
1798 1798                  for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1799 1799                          vdev_t *tvd = rvd->vdev_child[c];
1800 1800  
1801 1801                          /*
1802 1802                           * We consider a device missing only if it failed to
1803 1803                           * open (i.e. offline or faulted devices are not
1804 1804                           * considered missing).
1805 1805                           */
1806 1806                          if (tvd->vdev_islog &&
1807 1807                              tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1808 1808                                  child[idx++] = vdev_config_generate(spa, tvd,
1809 1809                                      B_FALSE, VDEV_CONFIG_MISSING);
1810 1810                          }
1811 1811                  }
1812 1812  
1813 1813                  if (idx > 0) {
1814 1814                          fnvlist_add_nvlist_array(nv,
1815 1815                              ZPOOL_CONFIG_CHILDREN, child, idx);
1816 1816                          fnvlist_add_nvlist(spa->spa_load_info,
1817 1817                              ZPOOL_CONFIG_MISSING_DEVICES, nv);
1818 1818  
1819 1819                          for (uint64_t i = 0; i < idx; i++)
1820 1820                                  nvlist_free(child[i]);
1821 1821                  }
1822 1822                  nvlist_free(nv);
1823 1823                  kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
1824 1824  
1825 1825                  if (idx > 0) {
1826 1826                          spa_load_failed(spa, "some log devices are missing");
1827 1827                          vdev_dbgmsg_print_tree(rvd, 2);
1828 1828                          return (SET_ERROR(ENXIO));
1829 1829                  }
1830 1830          } else {
1831 1831                  for (uint64_t c = 0; c < rvd->vdev_children; c++) {
1832 1832                          vdev_t *tvd = rvd->vdev_child[c];
1833 1833  
1834 1834                          if (tvd->vdev_islog &&
1835 1835                              tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
1836 1836                                  spa_set_log_state(spa, SPA_LOG_CLEAR);
1837 1837                                  spa_load_note(spa, "some log devices are "
1838 1838                                      "missing, ZIL is dropped.");
1839 1839                                  vdev_dbgmsg_print_tree(rvd, 2);
1840 1840                                  break;
1841 1841                          }
1842 1842                  }
1843 1843          }
1844 1844  
1845 1845          return (0);
1846 1846  }
1847 1847  
1848 1848  /*
1849 1849   * Check for missing log devices
1850 1850   */
1851 1851  static boolean_t
1852 1852  spa_check_logs(spa_t *spa)
1853 1853  {
1854 1854          boolean_t rv = B_FALSE;
1855 1855          dsl_pool_t *dp = spa_get_dsl(spa);
1856 1856  
1857 1857          switch (spa->spa_log_state) {
1858 1858          case SPA_LOG_MISSING:
1859 1859                  /* need to recheck in case slog has been restored */
1860 1860          case SPA_LOG_UNKNOWN:
1861 1861                  rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
1862 1862                      zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0);
1863 1863                  if (rv)
1864 1864                          spa_set_log_state(spa, SPA_LOG_MISSING);
1865 1865                  break;
1866 1866          }
1867 1867          return (rv);
1868 1868  }
1869 1869  
1870 1870  static boolean_t
1871 1871  spa_passivate_log(spa_t *spa)
1872 1872  {
1873 1873          vdev_t *rvd = spa->spa_root_vdev;
1874 1874          boolean_t slog_found = B_FALSE;
1875 1875  
1876 1876          ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1877 1877  
1878 1878          if (!spa_has_slogs(spa))
1879 1879                  return (B_FALSE);
1880 1880  
1881 1881          for (int c = 0; c < rvd->vdev_children; c++) {
1882 1882                  vdev_t *tvd = rvd->vdev_child[c];
1883 1883                  metaslab_group_t *mg = tvd->vdev_mg;
1884 1884  
1885 1885                  if (tvd->vdev_islog) {
1886 1886                          metaslab_group_passivate(mg);
1887 1887                          slog_found = B_TRUE;
1888 1888                  }
1889 1889          }
1890 1890  
1891 1891          return (slog_found);
1892 1892  }
1893 1893  
1894 1894  static void
1895 1895  spa_activate_log(spa_t *spa)
1896 1896  {
1897 1897          vdev_t *rvd = spa->spa_root_vdev;
1898 1898  
1899 1899          ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1900 1900  
1901 1901          for (int c = 0; c < rvd->vdev_children; c++) {
1902 1902                  vdev_t *tvd = rvd->vdev_child[c];
1903 1903                  metaslab_group_t *mg = tvd->vdev_mg;
1904 1904  
1905 1905                  if (tvd->vdev_islog)
1906 1906                          metaslab_group_activate(mg);
1907 1907          }
1908 1908  }
1909 1909  
1910 1910  int
1911 1911  spa_reset_logs(spa_t *spa)
1912 1912  {
1913 1913          int error;
1914 1914  
1915 1915          error = dmu_objset_find(spa_name(spa), zil_reset,
1916 1916              NULL, DS_FIND_CHILDREN);
1917 1917          if (error == 0) {
1918 1918                  /*
1919 1919                   * We successfully offlined the log device, sync out the
1920 1920                   * current txg so that the "stubby" block can be removed
1921 1921                   * by zil_sync().
1922 1922                   */
1923 1923                  txg_wait_synced(spa->spa_dsl_pool, 0);
1924 1924          }
1925 1925          return (error);
1926 1926  }
1927 1927  
1928 1928  static void
1929 1929  spa_aux_check_removed(spa_aux_vdev_t *sav)
1930 1930  {
1931 1931          for (int i = 0; i < sav->sav_count; i++)
1932 1932                  spa_check_removed(sav->sav_vdevs[i]);
1933 1933  }
1934 1934  
1935 1935  void
1936 1936  spa_claim_notify(zio_t *zio)
1937 1937  {
1938 1938          spa_t *spa = zio->io_spa;
1939 1939  
1940 1940          if (zio->io_error)
1941 1941                  return;
1942 1942  
1943 1943          mutex_enter(&spa->spa_props_lock);      /* any mutex will do */
1944 1944          if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1945 1945                  spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1946 1946          mutex_exit(&spa->spa_props_lock);
1947 1947  }
1948 1948  
1949 1949  typedef struct spa_load_error {
1950 1950          uint64_t        sle_meta_count;
1951 1951          uint64_t        sle_data_count;
1952 1952  } spa_load_error_t;
1953 1953  
1954 1954  static void
1955 1955  spa_load_verify_done(zio_t *zio)
1956 1956  {
1957 1957          blkptr_t *bp = zio->io_bp;
1958 1958          spa_load_error_t *sle = zio->io_private;
1959 1959          dmu_object_type_t type = BP_GET_TYPE(bp);
1960 1960          int error = zio->io_error;
1961 1961          spa_t *spa = zio->io_spa;
1962 1962  
1963 1963          abd_free(zio->io_abd);
1964 1964          if (error) {
1965 1965                  if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1966 1966                      type != DMU_OT_INTENT_LOG)
1967 1967                          atomic_inc_64(&sle->sle_meta_count);
1968 1968                  else
1969 1969                          atomic_inc_64(&sle->sle_data_count);
1970 1970          }
1971 1971  
1972 1972          mutex_enter(&spa->spa_scrub_lock);
1973 1973          spa->spa_scrub_inflight--;
1974 1974          cv_broadcast(&spa->spa_scrub_io_cv);
1975 1975          mutex_exit(&spa->spa_scrub_lock);
1976 1976  }
1977 1977  
1978 1978  /*
1979 1979   * Maximum number of concurrent scrub i/os to create while verifying
1980 1980   * a pool during import.
1981 1981   */
1982 1982  int spa_load_verify_maxinflight = 10000;
1983 1983  boolean_t spa_load_verify_metadata = B_TRUE;
1984 1984  boolean_t spa_load_verify_data = B_TRUE;
1985 1985  
1986 1986  /*ARGSUSED*/
1987 1987  static int
1988 1988  spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1989 1989      const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
1990 1990  {
1991 1991          if (bp == NULL || BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
1992 1992                  return (0);
1993 1993          /*
1994 1994           * Note: normally this routine will not be called if
1995 1995           * spa_load_verify_metadata is not set.  However, it may be useful
1996 1996           * to manually set the flag after the traversal has begun.
1997 1997           */
1998 1998          if (!spa_load_verify_metadata)
1999 1999                  return (0);
2000 2000          if (!BP_IS_METADATA(bp) && !spa_load_verify_data)
2001 2001                  return (0);
2002 2002  
2003 2003          zio_t *rio = arg;
2004 2004          size_t size = BP_GET_PSIZE(bp);
2005 2005  
2006 2006          mutex_enter(&spa->spa_scrub_lock);
2007 2007          while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
2008 2008                  cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
2009 2009          spa->spa_scrub_inflight++;
2010 2010          mutex_exit(&spa->spa_scrub_lock);
2011 2011  
2012 2012          zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size,
2013 2013              spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
2014 2014              ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
2015 2015              ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
2016 2016          return (0);
2017 2017  }
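
The spa_scrub_inflight accounting above is a simple counted throttle:
issuers sleep on spa_scrub_io_cv while the limit is reached, and
spa_load_verify_done() decrements and broadcasts. A minimal pthread sketch
of the same throttle, with illustrative names:

    #include <pthread.h>

    #define MAXINFLIGHT     10000

    static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t cv = PTHREAD_COND_INITIALIZER;
    static int inflight;

    static void
    io_start(void)
    {
            pthread_mutex_lock(&lock);
            while (inflight >= MAXINFLIGHT) /* block until a slot frees up */
                    pthread_cond_wait(&cv, &lock);
            inflight++;
            pthread_mutex_unlock(&lock);
    }

    static void
    io_done(void)                   /* called from the completion path */
    {
            pthread_mutex_lock(&lock);
            inflight--;
            pthread_cond_broadcast(&cv);
            pthread_mutex_unlock(&lock);
    }
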
2018 2018  
2019 2019  /* ARGSUSED */
2020 2020  int
2021 2021  verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
2022 2022  {
2023 2023          if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN)
2024 2024                  return (SET_ERROR(ENAMETOOLONG));
2025 2025  
2026 2026          return (0);
2027 2027  }
2028 2028  
2029 2029  static int
2030 2030  spa_load_verify(spa_t *spa)
2031 2031  {
2032 2032          zio_t *rio;
2033 2033          spa_load_error_t sle = { 0 };
2034 2034          zpool_load_policy_t policy;
2035 2035          boolean_t verify_ok = B_FALSE;
2036 2036          int error = 0;
2037 2037  
2038 2038          zpool_get_load_policy(spa->spa_config, &policy);
2039 2039  
2040 2040          if (policy.zlp_rewind & ZPOOL_NEVER_REWIND)
2041 2041                  return (0);
2042 2042  
2043 2043          dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
2044 2044          error = dmu_objset_find_dp(spa->spa_dsl_pool,
2045 2045              spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
2046 2046              DS_FIND_CHILDREN);
2047 2047          dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
2048 2048          if (error != 0)
2049 2049                  return (error);
2050 2050  
2051 2051          rio = zio_root(spa, NULL, &sle,
2052 2052              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
2053 2053  
2054 2054          if (spa_load_verify_metadata) {
2055 2055                  if (spa->spa_extreme_rewind) {
2056 2056                          spa_load_note(spa, "performing a complete scan of the "
2057 2057                              "pool since extreme rewind is on. This may take "
2058 2058                              "a very long time.\n  (spa_load_verify_data=%u, "
2059 2059                              "spa_load_verify_metadata=%u)",
2060 2060                              spa_load_verify_data, spa_load_verify_metadata);
2061 2061                  }
2062 2062                  error = traverse_pool(spa, spa->spa_verify_min_txg,
2063 2063                      TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
2064 2064                      spa_load_verify_cb, rio);
2065 2065          }
2066 2066  
2067 2067          (void) zio_wait(rio);
2068 2068  
2069 2069          spa->spa_load_meta_errors = sle.sle_meta_count;
2070 2070          spa->spa_load_data_errors = sle.sle_data_count;
2071 2071  
2072 2072          if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
2073 2073                  spa_load_note(spa, "spa_load_verify found %llu metadata errors "
2074 2074                      "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
2075 2075                      (u_longlong_t)sle.sle_data_count);
2076 2076          }
2077 2077  
2078 2078          if (spa_load_verify_dryrun ||
2079 2079              (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
2080 2080              sle.sle_data_count <= policy.zlp_maxdata)) {
2081 2081                  int64_t loss = 0;
2082 2082  
2083 2083                  verify_ok = B_TRUE;
2084 2084                  spa->spa_load_txg = spa->spa_uberblock.ub_txg;
2085 2085                  spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
2086 2086  
2087 2087                  loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
2088 2088                  VERIFY(nvlist_add_uint64(spa->spa_load_info,
2089 2089                      ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
2090 2090                  VERIFY(nvlist_add_int64(spa->spa_load_info,
2091 2091                      ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
2092 2092                  VERIFY(nvlist_add_uint64(spa->spa_load_info,
2093 2093                      ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
2094 2094          } else {
2095 2095                  spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
2096 2096          }
2097 2097  
2098 2098          if (spa_load_verify_dryrun)
2099 2099                  return (0);
2100 2100  
2101 2101          if (error) {
2102 2102                  if (error != ENXIO && error != EIO)
2103 2103                          error = SET_ERROR(EIO);
2104 2104                  return (error);
2105 2105          }
2106 2106  
2107 2107          return (verify_ok ? 0 : EIO);
2108 2108  }
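
Setting aside the dryrun case, the acceptance test above reduces to a single
predicate: the traversal must have succeeded and both error counts must fall
within the rewind policy's thresholds. With a strict policy (e.g. a zlp_maxmeta
of 0), any metadata error falls through to the rewind path, which records
spa_load_max_txg. A hypothetical distillation:

    #include <stdint.h>

    /* Hypothetical distillation of spa_load_verify()'s accept/rewind test. */
    static int
    load_verify_ok(int error, uint64_t meta_errs, uint64_t data_errs,
        uint64_t maxmeta, uint64_t maxdata)
    {
            return (error == 0 && meta_errs <= maxmeta &&
                data_errs <= maxdata);
    }
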
2109 2109  
2110 2110  /*
2111 2111   * Find a value in the pool props object.
2112 2112   */
2113 2113  static void
2114 2114  spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
2115 2115  {
2116 2116          (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
2117 2117              zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
2118 2118  }
2119 2119  
2120 2120  /*
2121 2121   * Find a value in the pool directory object.
2122 2122   */
2123 2123  static int
2124 2124  spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
2125 2125  {
2126 2126          int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
2127 2127              name, sizeof (uint64_t), 1, val);
2128 2128  
2129 2129          if (error != 0 && (error != ENOENT || log_enoent)) {
2130 2130                  spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
2131 2131                      "[error=%d]", name, error);
2132 2132          }
2133 2133  
2134 2134          return (error);
2135 2135  }
2136 2136  
2137 2137  static int
2138 2138  spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
2139 2139  {
2140 2140          vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
2141 2141          return (SET_ERROR(err));
2142 2142  }
2143 2143  
2144 2144  static void
2145 2145  spa_spawn_aux_threads(spa_t *spa)
2146 2146  {
2147 2147          ASSERT(spa_writeable(spa));
2148 2148  
2149 2149          ASSERT(MUTEX_HELD(&spa_namespace_lock));
2150 2150  
2151 2151          spa_start_indirect_condensing_thread(spa);
2152 2152  
2153 2153          ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL);
2154 2154          spa->spa_checkpoint_discard_zthr =
2155 2155              zthr_create(spa_checkpoint_discard_thread_check,
2156 2156              spa_checkpoint_discard_thread, spa);
2157 2157  }
2158 2158  
2159 2159  /*
2160 2160   * Fix up config after a partly-completed split.  This is done with the
2161 2161   * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
2162 2162   * pool have that entry in their config, but only the splitting one contains
2163 2163   * a list of all the guids of the vdevs that are being split off.
2164 2164   *
2165 2165   * This function determines what to do with that list: either rejoin
2166 2166   * all the disks to the pool, or complete the splitting process.  To attempt
2167 2167   * the rejoin, each disk that is offlined is marked online again, and
2168 2168   * we do a reopen() call.  If the vdev label for every disk that was
2169 2169   * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
2170 2170   * then we call vdev_split() on each disk, and complete the split.
2171 2171   *
2172 2172   * Otherwise we leave the config alone, with all the vdevs in place in
2173 2173   * the original pool.
2174 2174   */
2175 2175  static void
2176 2176  spa_try_repair(spa_t *spa, nvlist_t *config)
2177 2177  {
2178 2178          uint_t extracted;
2179 2179          uint64_t *glist;
2180 2180          uint_t i, gcount;
2181 2181          nvlist_t *nvl;
2182 2182          vdev_t **vd;
2183 2183          boolean_t attempt_reopen;
2184 2184  
2185 2185          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
2186 2186                  return;
2187 2187  
2188 2188          /* check that the config is complete */
2189 2189          if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
2190 2190              &glist, &gcount) != 0)
2191 2191                  return;
2192 2192  
2193 2193          vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
2194 2194  
2195 2195          /* attempt to online all the vdevs & validate */
2196 2196          attempt_reopen = B_TRUE;
2197 2197          for (i = 0; i < gcount; i++) {
2198 2198                  if (glist[i] == 0)      /* vdev is hole */
2199 2199                          continue;
2200 2200  
2201 2201                  vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
2202 2202                  if (vd[i] == NULL) {
2203 2203                          /*
2204 2204                           * Don't bother attempting to reopen the disks;
2205 2205                           * just do the split.
2206 2206                           */
2207 2207                          attempt_reopen = B_FALSE;
2208 2208                  } else {
2209 2209                          /* attempt to re-online it */
2210 2210                          vd[i]->vdev_offline = B_FALSE;
2211 2211                  }
2212 2212          }
2213 2213  
2214 2214          if (attempt_reopen) {
2215 2215                  vdev_reopen(spa->spa_root_vdev);
2216 2216  
2217 2217                  /* check each device to see what state it's in */
2218 2218                  for (extracted = 0, i = 0; i < gcount; i++) {
2219 2219                          if (vd[i] != NULL &&
2220 2220                              vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
2221 2221                                  break;
2222 2222                          ++extracted;
2223 2223                  }
2224 2224          }
2225 2225  
2226 2226          /*
2227 2227           * If every disk has been moved to the new pool, or if we never
2228 2228           * even attempted to look at them, then we split them off for
2229 2229           * good.
2230 2230           */
2231 2231          if (!attempt_reopen || gcount == extracted) {
2232 2232                  for (i = 0; i < gcount; i++)
2233 2233                          if (vd[i] != NULL)
2234 2234                                  vdev_split(vd[i]);
2235 2235                  vdev_reopen(spa->spa_root_vdev);
2236 2236          }
2237 2237  
2238 2238          kmem_free(vd, gcount * sizeof (vdev_t *));
2239 2239  }
2240 2240  
2241 2241  static int
2242 2242  spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
2243 2243  {
2244 2244          char *ereport = FM_EREPORT_ZFS_POOL;
2245 2245          int error;
2246 2246  
2247 2247          spa->spa_load_state = state;
2248 2248  
2249 2249          gethrestime(&spa->spa_loaded_ts);
2250 2250          error = spa_load_impl(spa, type, &ereport);
2251 2251  
2252 2252          /*
2253 2253           * Don't count references from objsets that are already closed
2254 2254           * and are making their way through the eviction process.
2255 2255           */
2256 2256          spa_evicting_os_wait(spa);
2257 2257          spa->spa_minref = refcount_count(&spa->spa_refcount);
2258 2258          if (error) {
2259 2259                  if (error != EEXIST) {
2260 2260                          spa->spa_loaded_ts.tv_sec = 0;
2261 2261                          spa->spa_loaded_ts.tv_nsec = 0;
2262 2262                  }
2263 2263                  if (error != EBADF) {
2264 2264                          zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
2265 2265                  }
2266 2266          }
2267 2267          spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
2268 2268          spa->spa_ena = 0;
2269 2269  
2270 2270          return (error);
2271 2271  }
2272 2272  
2273 2273  /*
2274 2274   * Count the number of per-vdev ZAPs associated with all of the vdevs in the
2275 2275   * vdev tree rooted in the given vd, and ensure that each ZAP is present in the
2276 2276   * spa's per-vdev ZAP list.
2277 2277   */
2278 2278  static uint64_t
2279 2279  vdev_count_verify_zaps(vdev_t *vd)
2280 2280  {
2281 2281          spa_t *spa = vd->vdev_spa;
2282 2282          uint64_t total = 0;
2283 2283          if (vd->vdev_top_zap != 0) {
2284 2284                  total++;
2285 2285                  ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2286 2286                      spa->spa_all_vdev_zaps, vd->vdev_top_zap));
2287 2287          }
2288 2288          if (vd->vdev_leaf_zap != 0) {
2289 2289                  total++;
2290 2290                  ASSERT0(zap_lookup_int(spa->spa_meta_objset,
2291 2291                      spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
2292 2292          }
2293 2293  
2294 2294          for (uint64_t i = 0; i < vd->vdev_children; i++) {
2295 2295                  total += vdev_count_verify_zaps(vd->vdev_child[i]);
2296 2296          }
2297 2297  
2298 2298          return (total);
2299 2299  }
2300 2300  
2301 2301  static int
2302 2302  spa_verify_host(spa_t *spa, nvlist_t *mos_config)
2303 2303  {
2304 2304          uint64_t hostid;
2305 2305          char *hostname;
2306 2306          uint64_t myhostid = 0;
2307 2307  
2308 2308          if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
2309 2309              ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2310 2310                  hostname = fnvlist_lookup_string(mos_config,
2311 2311                      ZPOOL_CONFIG_HOSTNAME);
2312 2312  
2313 2313                  myhostid = zone_get_hostid(NULL);
2314 2314  
2315 2315                  if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
2316 2316                          cmn_err(CE_WARN, "pool '%s' could not be "
2317 2317                              "loaded as it was last accessed by "
2318 2318                              "another system (host: %s hostid: 0x%llx). "
2319 2319                              "See: http://illumos.org/msg/ZFS-8000-EY",
2320 2320                              spa_name(spa), hostname, (u_longlong_t)hostid);
2321 2321                          spa_load_failed(spa, "hostid verification failed: pool "
2322 2322                              "last accessed by host: %s (hostid: 0x%llx)",
2323 2323                              hostname, (u_longlong_t)hostid);
2324 2324                          return (SET_ERROR(EBADF));
2325 2325                  }
2326 2326          }
2327 2327  
2328 2328          return (0);
2329 2329  }
2330 2330  
2331 2331  static int
2332 2332  spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
2333 2333  {
2334 2334          int error = 0;
2335 2335          nvlist_t *nvtree, *nvl, *config = spa->spa_config;
2336 2336          int parse;
2337 2337          vdev_t *rvd;
2338 2338          uint64_t pool_guid;
2339 2339          char *comment;
2340 2340  
2341 2341          /*
2342 2342           * Versioning wasn't explicitly added to the label until later, so if
2343 2343           * it's not present treat it as the initial version.
2344 2344           */
2345 2345          if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
2346 2346              &spa->spa_ubsync.ub_version) != 0)
2347 2347                  spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
2348 2348  
2349 2349          if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
2350 2350                  spa_load_failed(spa, "invalid config provided: '%s' missing",
2351 2351                      ZPOOL_CONFIG_POOL_GUID);
2352 2352                  return (SET_ERROR(EINVAL));
2353 2353          }
2354 2354  
2355 2355          /*
2356 2356           * If we are doing an import, ensure that the pool is not already
2357 2357           * imported by checking if its pool guid already exists in the
2358 2358           * spa namespace.
2359 2359           *
2360 2360           * The only case in which we allow an already imported pool to be
2361 2361           * imported again is when the pool is checkpointed and we want to
2362 2362           * look at its checkpointed state from userland tools like zdb.
2363 2363           */
2364 2364  #ifdef _KERNEL
2365 2365          if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2366 2366              spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2367 2367              spa_guid_exists(pool_guid, 0)) {
2368 2368  #else
2369 2369          if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
2370 2370              spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
2371 2371              spa_guid_exists(pool_guid, 0) &&
2372 2372              !spa_importing_readonly_checkpoint(spa)) {
2373 2373  #endif
2374 2374                  spa_load_failed(spa, "a pool with guid %llu is already open",
2375 2375                      (u_longlong_t)pool_guid);
2376 2376                  return (SET_ERROR(EEXIST));
2377 2377          }
2378 2378  
2379 2379          spa->spa_config_guid = pool_guid;
2380 2380  
2381 2381          nvlist_free(spa->spa_load_info);
2382 2382          spa->spa_load_info = fnvlist_alloc();
2383 2383  
2384 2384          ASSERT(spa->spa_comment == NULL);
2385 2385          if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
2386 2386                  spa->spa_comment = spa_strdup(comment);
2387 2387  
2388 2388          (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
2389 2389              &spa->spa_config_txg);
2390 2390  
2391 2391          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
2392 2392                  spa->spa_config_splitting = fnvlist_dup(nvl);
2393 2393  
2394 2394          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
2395 2395                  spa_load_failed(spa, "invalid config provided: '%s' missing",
2396 2396                      ZPOOL_CONFIG_VDEV_TREE);
2397 2397                  return (SET_ERROR(EINVAL));
2398 2398          }
2399 2399  
2400 2400          /*
2401 2401           * Create "The Godfather" zio to hold all async IOs
2402 2402           */
2403 2403          spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
2404 2404              KM_SLEEP);
2405 2405          for (int i = 0; i < max_ncpus; i++) {
2406 2406                  spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
2407 2407                      ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
2408 2408                      ZIO_FLAG_GODFATHER);
2409 2409          }
2410 2410  
2411 2411          /*
2412 2412           * Parse the configuration into a vdev tree.  We explicitly set the
2413 2413           * value that will be returned by spa_version() since parsing the
2414 2414           * configuration requires knowing the version number.
2415 2415           */
2416 2416          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2417 2417          parse = (type == SPA_IMPORT_EXISTING ?
2418 2418              VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
2419 2419          error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
2420 2420          spa_config_exit(spa, SCL_ALL, FTAG);
2421 2421  
2422 2422          if (error != 0) {
2423 2423                  spa_load_failed(spa, "unable to parse config [error=%d]",
2424 2424                      error);
2425 2425                  return (error);
2426 2426          }
2427 2427  
2428 2428          ASSERT(spa->spa_root_vdev == rvd);
2429 2429          ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
2430 2430          ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
2431 2431  
2432 2432          if (type != SPA_IMPORT_ASSEMBLE) {
2433 2433                  ASSERT(spa_guid(spa) == pool_guid);
2434 2434          }
2435 2435  
2436 2436          return (0);
2437 2437  }
2438 2438  
2439 2439  /*
2440 2440   * Recursively open all vdevs in the vdev tree. This function is called twice:
2441 2441   * first with the untrusted config, then with the trusted config.
2442 2442   */
2443 2443  static int
2444 2444  spa_ld_open_vdevs(spa_t *spa)
2445 2445  {
2446 2446          int error = 0;
2447 2447  
2448 2448          /*
2449 2449           * spa_missing_tvds_allowed defines how many top-level vdevs can be
2450 2450           * missing/unopenable for the root vdev to still be considered openable.
2451 2451           */
2452 2452          if (spa->spa_trust_config) {
2453 2453                  spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
2454 2454          } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) {
2455 2455                  spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile;
2456 2456          } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) {
2457 2457                  spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan;
2458 2458          } else {
2459 2459                  spa->spa_missing_tvds_allowed = 0;
2460 2460          }
2461 2461  
2462 2462          spa->spa_missing_tvds_allowed =
2463 2463              MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
2464 2464  
2465 2465          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2466 2466          error = vdev_open(spa->spa_root_vdev);
2467 2467          spa_config_exit(spa, SCL_ALL, FTAG);
2468 2468  
2469 2469          if (spa->spa_missing_tvds != 0) {
2470 2470                  spa_load_note(spa, "vdev tree has %lld missing top-level "
2471 2471                      "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
2472 2472                  if (spa->spa_trust_config && (spa->spa_mode & FWRITE)) {
2473 2473                          /*
2474 2474                           * Although theoretically we could allow users to open
2475 2475                           * incomplete pools in RW mode, we'd need to add a lot
2476 2476                           * of extra logic (e.g. adjust pool space to account
2477 2477                           * for missing vdevs).
2478 2478                           * This limitation also prevents users from accidentally
2479 2479                           * opening the pool in RW mode during data recovery and
2480 2480                           * damaging it further.
2481 2481                           */
2482 2482                          spa_load_note(spa, "pools with missing top-level "
2483 2483                              "vdevs can only be opened in read-only mode.");
2484 2484                          error = SET_ERROR(ENXIO);
2485 2485                  } else {
2486 2486                          spa_load_note(spa, "current settings allow for maximum "
2487 2487                              "%lld missing top-level vdevs at this stage.",
2488 2488                              (u_longlong_t)spa->spa_missing_tvds_allowed);
2489 2489                  }
2490 2490          }
2491 2491          if (error != 0) {
2492 2492                  spa_load_failed(spa, "unable to open vdev tree [error=%d]",
2493 2493                      error);
2494 2494          }
2495 2495          if (spa->spa_missing_tvds != 0 || error != 0)
2496 2496                  vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
2497 2497  
2498 2498          return (error);
2499 2499  }
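
Stripped of the spa state, the tolerance computed above is a pure function of
the config's trust level and source, with the global tunable acting as a
floor. A minimal sketch under that reading (the enum and tunable names are
local stand-ins for SPA_CONFIG_SRC_* and the zfs_max_missing_tvds* knobs):

#include <stdint.h>
#include <stdio.h>

enum cfg_src { SRC_CACHEFILE, SRC_SCAN, SRC_OTHER };

#define	MAX(a, b) ((a) > (b) ? (a) : (b))

/* toy knobs mirroring the zfs_max_missing_tvds* tunables */
static uint64_t max_missing = 0;
static uint64_t max_missing_cachefile = 2;
static uint64_t max_missing_scan = 0;

static uint64_t
missing_tvds_allowed(int trusted, enum cfg_src src)
{
	uint64_t allowed;

	if (trusted)
		allowed = max_missing;
	else if (src == SRC_CACHEFILE)
		allowed = max_missing_cachefile;
	else if (src == SRC_SCAN)
		allowed = max_missing_scan;
	else
		allowed = 0;

	/* the global tunable always acts as a floor */
	return (MAX(max_missing, allowed));
}

int
main(void)
{
	printf("untrusted cachefile: %llu\n",
	    (unsigned long long)missing_tvds_allowed(0, SRC_CACHEFILE));
	printf("trusted: %llu\n",
	    (unsigned long long)missing_tvds_allowed(1, SRC_OTHER));
	return (0);
}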
2500 2500  
2501 2501  /*
2502 2502   * We need to validate the vdev labels against the configuration that
2503 2503   * we have in hand. This function is called twice: first with an untrusted
2504 2504   * config, then with a trusted config. The validation is more strict when the
2505 2505   * config is trusted.
2506 2506   */
2507 2507  static int
2508 2508  spa_ld_validate_vdevs(spa_t *spa)
2509 2509  {
2510 2510          int error = 0;
2511 2511          vdev_t *rvd = spa->spa_root_vdev;
2512 2512  
2513 2513          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2514 2514          error = vdev_validate(rvd);
2515 2515          spa_config_exit(spa, SCL_ALL, FTAG);
2516 2516  
2517 2517          if (error != 0) {
2518 2518                  spa_load_failed(spa, "vdev_validate failed [error=%d]", error);
2519 2519                  return (error);
2520 2520          }
2521 2521  
2522 2522          if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) {
2523 2523                  spa_load_failed(spa, "cannot open vdev tree after invalidating "
2524 2524                      "some vdevs");
2525 2525                  vdev_dbgmsg_print_tree(rvd, 2);
2526 2526                  return (SET_ERROR(ENXIO));
2527 2527          }
2528 2528  
2529 2529          return (0);
2530 2530  }
2531 2531  
2532 2532  static void
2533 2533  spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
2534 2534  {
2535 2535          spa->spa_state = POOL_STATE_ACTIVE;
2536 2536          spa->spa_ubsync = spa->spa_uberblock;
2537 2537          spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
2538 2538              TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
2539 2539          spa->spa_first_txg = spa->spa_last_ubsync_txg ?
2540 2540              spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2541 2541          spa->spa_claim_max_txg = spa->spa_first_txg;
2542 2542          spa->spa_prev_software_version = ub->ub_software_version;
2543 2543  }
2544 2544  
2545 2545  static int
2546 2546  spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
2547 2547  {
2548 2548          vdev_t *rvd = spa->spa_root_vdev;
2549 2549          nvlist_t *label;
2550 2550          uberblock_t *ub = &spa->spa_uberblock;
2551 2551  
2552 2552          /*
2553 2553           * If we are opening the checkpointed state of the pool by
2554 2554           * rewinding to it, at this point we will have written the
2555 2555           * checkpointed uberblock to the vdev labels, so searching
2556 2556           * the labels will find the right uberblock.  However, if
2557 2557           * we are opening the checkpointed state read-only, we have
2558 2558           * not modified the labels. Therefore, we must ignore the
2559 2559           * labels and continue using the spa_uberblock that was set
2560 2560           * by spa_ld_checkpoint_rewind.
2561 2561           *
2562 2562           * Note that it would be fine to ignore the labels when
2563 2563           * rewinding (opening writeable) as well. However, if we
2564 2564           * crash just after writing the labels, we will end up
2565 2565           * searching the labels. Doing so in the common case means
2566 2566           * that this code path gets exercised normally, rather than
2567 2567           * just in the edge case.
2568 2568           */
2569 2569          if (ub->ub_checkpoint_txg != 0 &&
2570 2570              spa_importing_readonly_checkpoint(spa)) {
2571 2571                  spa_ld_select_uberblock_done(spa, ub);
2572 2572                  return (0);
2573 2573          }
2574 2574  
2575 2575          /*
2576 2576           * Find the best uberblock.
2577 2577           */
2578 2578          vdev_uberblock_load(rvd, ub, &label);
2579 2579  
2580 2580          /*
2581 2581           * If we weren't able to find a single valid uberblock, return failure.
2582 2582           */
2583 2583          if (ub->ub_txg == 0) {
2584 2584                  nvlist_free(label);
2585 2585                  spa_load_failed(spa, "no valid uberblock found");
2586 2586                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
2587 2587          }
2588 2588  
2589 2589          spa_load_note(spa, "using uberblock with txg=%llu",
2590 2590              (u_longlong_t)ub->ub_txg);
2591 2591  
2592 2592          /*
2593 2593           * If the pool has an unsupported version we can't open it.
2594 2594           */
2595 2595          if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
2596 2596                  nvlist_free(label);
2597 2597                  spa_load_failed(spa, "version %llu is not supported",
2598 2598                      (u_longlong_t)ub->ub_version);
2599 2599                  return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
2600 2600          }
2601 2601  
2602 2602          if (ub->ub_version >= SPA_VERSION_FEATURES) {
2603 2603                  nvlist_t *features;
2604 2604  
2605 2605                  /*
2606 2606                   * If we weren't able to find what's necessary for reading the
2607 2607                   * MOS in the label, return failure.
2608 2608                   */
2609 2609                  if (label == NULL) {
2610 2610                          spa_load_failed(spa, "label config unavailable");
2611 2611                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2612 2612                              ENXIO));
2613 2613                  }
2614 2614  
2615 2615                  if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
2616 2616                      &features) != 0) {
2617 2617                          nvlist_free(label);
2618 2618                          spa_load_failed(spa, "invalid label: '%s' missing",
2619 2619                              ZPOOL_CONFIG_FEATURES_FOR_READ);
2620 2620                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2621 2621                              ENXIO));
2622 2622                  }
2623 2623  
2624 2624                  /*
2625 2625                   * Update our in-core representation with the definitive values
2626 2626                   * from the label.
2627 2627                   */
2628 2628                  nvlist_free(spa->spa_label_features);
2629 2629                  VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
2630 2630          }
2631 2631  
2632 2632          nvlist_free(label);
2633 2633  
2634 2634          /*
2635 2635           * Look through entries in the label nvlist's features_for_read. If
2636 2636           * there is a feature listed there which we don't understand then we
2637 2637           * cannot open the pool.
2638 2638           */
2639 2639          if (ub->ub_version >= SPA_VERSION_FEATURES) {
2640 2640                  nvlist_t *unsup_feat;
2641 2641  
2642 2642                  VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
2643 2643                      0);
2644 2644  
2645 2645                  for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
2646 2646                      NULL); nvp != NULL;
2647 2647                      nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
2648 2648                          if (!zfeature_is_supported(nvpair_name(nvp))) {
2649 2649                                  VERIFY(nvlist_add_string(unsup_feat,
2650 2650                                      nvpair_name(nvp), "") == 0);
2651 2651                          }
2652 2652                  }
2653 2653  
2654 2654                  if (!nvlist_empty(unsup_feat)) {
2655 2655                          VERIFY(nvlist_add_nvlist(spa->spa_load_info,
2656 2656                              ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
2657 2657                          nvlist_free(unsup_feat);
2658 2658                          spa_load_failed(spa, "some features are unsupported");
2659 2659                          return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2660 2660                              ENOTSUP));
2661 2661                  }
2662 2662  
2663 2663                  nvlist_free(unsup_feat);
2664 2664          }
2665 2665  
2666 2666          if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
2667 2667                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2668 2668                  spa_try_repair(spa, spa->spa_config);
2669 2669                  spa_config_exit(spa, SCL_ALL, FTAG);
2670 2670                  nvlist_free(spa->spa_config_splitting);
2671 2671                  spa->spa_config_splitting = NULL;
2672 2672          }
2673 2673  
2674 2674          /*
2675 2675           * Initialize internal SPA structures.
2676 2676           */
2677 2677          spa_ld_select_uberblock_done(spa, ub);
2678 2678  
2679 2679          return (0);
2680 2680  }
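
vdev_uberblock_load() walks every label and keeps the best candidate, where
"best" is, to a first approximation, the highest txg with the timestamp as a
tie-breaker; the real ordering lives in vdev_uberblock_compare() and has
additional cases. A toy version of that selection, including the
ub_txg == 0 "none found" sentinel checked above:

#include <stdint.h>
#include <stdio.h>

struct ub { uint64_t txg; uint64_t timestamp; };

/* simplified: higher txg wins, then later timestamp */
static int
ub_compare(const struct ub *a, const struct ub *b)
{
	if (a->txg != b->txg)
		return (a->txg < b->txg ? -1 : 1);
	if (a->timestamp != b->timestamp)
		return (a->timestamp < b->timestamp ? -1 : 1);
	return (0);
}

int
main(void)
{
	struct ub labels[] = {
		{ 100, 5000 }, { 102, 5010 }, { 102, 5007 }, { 0, 0 },
	};
	struct ub best = { 0, 0 };      /* ub_txg == 0 means "none found" */

	for (int i = 0; i < 4; i++)
		if (ub_compare(&labels[i], &best) > 0)
			best = labels[i];

	if (best.txg == 0)
		printf("no valid uberblock found\n");
	else
		printf("using uberblock with txg=%llu\n",
		    (unsigned long long)best.txg);
	return (0);
}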
2681 2681  
2682 2682  static int
2683 2683  spa_ld_open_rootbp(spa_t *spa)
2684 2684  {
2685 2685          int error = 0;
2686 2686          vdev_t *rvd = spa->spa_root_vdev;
2687 2687  
2688 2688          error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2689 2689          if (error != 0) {
2690 2690                  spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
2691 2691                      "[error=%d]", error);
2692 2692                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2693 2693          }
2694 2694          spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2695 2695  
2696 2696          return (0);
2697 2697  }
2698 2698  
2699 2699  static int
2700 2700  spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
2701 2701      boolean_t reloading)
2702 2702  {
2703 2703          vdev_t *mrvd, *rvd = spa->spa_root_vdev;
2704 2704          nvlist_t *nv, *mos_config, *policy;
2705 2705          int error = 0, copy_error;
2706 2706          uint64_t healthy_tvds, healthy_tvds_mos;
2707 2707          uint64_t mos_config_txg;
2708 2708  
2709 2709          if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
2710 2710              != 0)
2711 2711                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2712 2712  
2713 2713          /*
2714 2714           * If we're assembling a pool from a split, the config provided is
2715 2715           * already trusted so there is nothing to do.
2716 2716           */
2717 2717          if (type == SPA_IMPORT_ASSEMBLE)
2718 2718                  return (0);
2719 2719  
2720 2720          healthy_tvds = spa_healthy_core_tvds(spa);
2721 2721  
2722 2722          if (load_nvlist(spa, spa->spa_config_object, &mos_config)
2723 2723              != 0) {
2724 2724                  spa_load_failed(spa, "unable to retrieve MOS config");
2725 2725                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2726 2726          }
2727 2727  
2728 2728          /*
2729 2729           * If we are doing an open, the pool owner hasn't been verified yet,
2730 2730           * so do the verification here.
2731 2731           */
2732 2732          if (spa->spa_load_state == SPA_LOAD_OPEN) {
2733 2733                  error = spa_verify_host(spa, mos_config);
2734 2734                  if (error != 0) {
2735 2735                          nvlist_free(mos_config);
2736 2736                          return (error);
2737 2737                  }
2738 2738          }
2739 2739  
2740 2740          nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
2741 2741  
2742 2742          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2743 2743  
2744 2744          /*
2745 2745           * Build a new vdev tree from the trusted config
2746 2746           */
2747 2747          VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
2748 2748  
2749 2749          /*
2750 2750           * Vdev paths in the MOS may be obsolete. If the untrusted config was
2751 2751           * obtained by scanning /dev/dsk, then it will have the right vdev
2752 2752           * paths. We update the trusted MOS config with this information.
2753 2753           * We first try to copy the paths with vdev_copy_path_strict, which
2754 2754           * succeeds only when both configs have exactly the same vdev tree.
2755 2755           * If that fails, we fall back to a more flexible method that has a
2756 2756           * best effort policy.
2757 2757           */
2758 2758          copy_error = vdev_copy_path_strict(rvd, mrvd);
2759 2759          if (copy_error != 0 || spa_load_print_vdev_tree) {
2760 2760                  spa_load_note(spa, "provided vdev tree:");
2761 2761                  vdev_dbgmsg_print_tree(rvd, 2);
2762 2762                  spa_load_note(spa, "MOS vdev tree:");
2763 2763                  vdev_dbgmsg_print_tree(mrvd, 2);
2764 2764          }
2765 2765          if (copy_error != 0) {
2766 2766                  spa_load_note(spa, "vdev_copy_path_strict failed, falling "
2767 2767                      "back to vdev_copy_path_relaxed");
2768 2768                  vdev_copy_path_relaxed(rvd, mrvd);
2769 2769          }
2770 2770  
2771 2771          vdev_close(rvd);
2772 2772          vdev_free(rvd);
2773 2773          spa->spa_root_vdev = mrvd;
2774 2774          rvd = mrvd;
2775 2775          spa_config_exit(spa, SCL_ALL, FTAG);
2776 2776  
2777 2777          /*
2778 2778           * We will use spa_config if we decide to reload the spa or if spa_load
2779 2779           * fails and we rewind. We must thus regenerate the config using the
2780 2780           * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
2781 2781           * pass settings on how to load the pool and is not stored in the MOS.
2782 2782           * We copy it over to our new, trusted config.
2783 2783           */
2784 2784          mos_config_txg = fnvlist_lookup_uint64(mos_config,
2785 2785              ZPOOL_CONFIG_POOL_TXG);
2786 2786          nvlist_free(mos_config);
2787 2787          mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
2788 2788          if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
2789 2789              &policy) == 0)
2790 2790                  fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
2791 2791          spa_config_set(spa, mos_config);
2792 2792          spa->spa_config_source = SPA_CONFIG_SRC_MOS;
2793 2793  
2794 2794          /*
2795 2795           * Now that we have the config from the MOS, we should be more strict
2796 2796           * in checking blkptrs and can make assumptions about the consistency
2797 2797           * of the vdev tree. spa_trust_config must be set to true before opening
2798 2798           * vdevs in order for them to be writeable.
2799 2799           */
2800 2800          spa->spa_trust_config = B_TRUE;
2801 2801  
2802 2802          /*
2803 2803           * Open and validate the new vdev tree
2804 2804           */
2805 2805          error = spa_ld_open_vdevs(spa);
2806 2806          if (error != 0)
2807 2807                  return (error);
2808 2808  
2809 2809          error = spa_ld_validate_vdevs(spa);
2810 2810          if (error != 0)
2811 2811                  return (error);
2812 2812  
2813 2813          if (copy_error != 0 || spa_load_print_vdev_tree) {
2814 2814                  spa_load_note(spa, "final vdev tree:");
2815 2815                  vdev_dbgmsg_print_tree(rvd, 2);
2816 2816          }
2817 2817  
2818 2818          if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
2819 2819              !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
2820 2820                  /*
2821 2821                   * Sanity check to make sure that we are indeed loading the
2822 2822                   * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
2823 2823                   * in the config provided and they happened to be the only ones
2824 2824                   * to have the latest uberblock, we could involuntarily perform
2825 2825                   * an extreme rewind.
2826 2826                   */
2827 2827                  healthy_tvds_mos = spa_healthy_core_tvds(spa);
2828 2828                  if (healthy_tvds_mos - healthy_tvds >=
2829 2829                      SPA_SYNC_MIN_VDEVS) {
2830 2830                          spa_load_note(spa, "config provided misses too many "
2831 2831                              "top-level vdevs compared to MOS (%lld vs %lld). ",
2832 2832                              (u_longlong_t)healthy_tvds,
2833 2833                              (u_longlong_t)healthy_tvds_mos);
2834 2834                          spa_load_note(spa, "vdev tree:");
2835 2835                          vdev_dbgmsg_print_tree(rvd, 2);
2836 2836                          if (reloading) {
2837 2837                                  spa_load_failed(spa, "config was already "
2838 2838                                      "provided from MOS. Aborting.");
2839 2839                                  return (spa_vdev_err(rvd,
2840 2840                                      VDEV_AUX_CORRUPT_DATA, EIO));
2841 2841                          }
2842 2842                          spa_load_note(spa, "spa must be reloaded using MOS "
2843 2843                              "config");
2844 2844                          return (SET_ERROR(EAGAIN));
2845 2845                  }
2846 2846          }
2847 2847  
2848 2848          error = spa_check_for_missing_logs(spa);
2849 2849          if (error != 0)
2850 2850                  return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
2851 2851  
2852 2852          if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
2853 2853                  spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
2854 2854                      "guid sum (%llu != %llu)",
2855 2855                      (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
2856 2856                      (u_longlong_t)rvd->vdev_guid_sum);
2857 2857                  return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2858 2858                      ENXIO));
2859 2859          }
2860 2860  
2861 2861          return (0);
2862 2862  }
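
The strict-then-relaxed path copy above is easiest to see with the vdev trees
flattened to arrays keyed by guid: the strict pass refuses to do anything
unless both trees match position for position, while the relaxed pass copies
whatever it can match by guid. A hedged sketch (flat arrays instead of real
vdev trees; not the illumos implementation):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct vd { uint64_t guid; char path[32]; };

/* strict: trees must match one-to-one, position by position */
static int
copy_path_strict(const struct vd *src, struct vd *dst, int n_src, int n_dst)
{
	if (n_src != n_dst)
		return (-1);
	for (int i = 0; i < n_src; i++)
		if (src[i].guid != dst[i].guid)
			return (-1);
	for (int i = 0; i < n_src; i++)
		strcpy(dst[i].path, src[i].path);
	return (0);
}

/* relaxed: best effort, match by guid wherever possible */
static void
copy_path_relaxed(const struct vd *src, struct vd *dst, int n_src, int n_dst)
{
	for (int i = 0; i < n_dst; i++)
		for (int j = 0; j < n_src; j++)
			if (dst[i].guid == src[j].guid)
				strcpy(dst[i].path, src[j].path);
}

int
main(void)
{
	struct vd scanned[] = { { 1, "/dev/dsk/c1d0" }, { 2, "/dev/dsk/c1d1" } };
	struct vd mos[] = { { 2, "old-path" }, { 1, "old-path" } };

	if (copy_path_strict(scanned, mos, 2, 2) != 0)
		copy_path_relaxed(scanned, mos, 2, 2);
	printf("%s %s\n", mos[0].path, mos[1].path);
	return (0);
}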
2863 2863  
2864 2864  static int
2865 2865  spa_ld_open_indirect_vdev_metadata(spa_t *spa)
2866 2866  {
2867 2867          int error = 0;
2868 2868          vdev_t *rvd = spa->spa_root_vdev;
2869 2869  
2870 2870          /*
2871 2871           * Everything that we read before spa_remove_init() must be stored
2872 2872           * on concrete vdevs.  Therefore we do this as early as possible.
2873 2873           */
2874 2874          error = spa_remove_init(spa);
2875 2875          if (error != 0) {
2876 2876                  spa_load_failed(spa, "spa_remove_init failed [error=%d]",
2877 2877                      error);
2878 2878                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2879 2879          }
2880 2880  
2881 2881          /*
2882 2882           * Retrieve information needed to condense indirect vdev mappings.
2883 2883           */
2884 2884          error = spa_condense_init(spa);
2885 2885          if (error != 0) {
2886 2886                  spa_load_failed(spa, "spa_condense_init failed [error=%d]",
2887 2887                      error);
2888 2888                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
2889 2889          }
2890 2890  
2891 2891          return (0);
2892 2892  }
2893 2893  
2894 2894  static int
2895 2895  spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
2896 2896  {
2897 2897          int error = 0;
2898 2898          vdev_t *rvd = spa->spa_root_vdev;
2899 2899  
2900 2900          if (spa_version(spa) >= SPA_VERSION_FEATURES) {
2901 2901                  boolean_t missing_feat_read = B_FALSE;
2902 2902                  nvlist_t *unsup_feat, *enabled_feat;
2903 2903  
2904 2904                  if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
2905 2905                      &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
2906 2906                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2907 2907                  }
2908 2908  
2909 2909                  if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
2910 2910                      &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
2911 2911                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2912 2912                  }
2913 2913  
2914 2914                  if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
2915 2915                      &spa->spa_feat_desc_obj, B_TRUE) != 0) {
2916 2916                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2917 2917                  }
2918 2918  
2919 2919                  enabled_feat = fnvlist_alloc();
2920 2920                  unsup_feat = fnvlist_alloc();
2921 2921  
2922 2922                  if (!spa_features_check(spa, B_FALSE,
2923 2923                      unsup_feat, enabled_feat))
2924 2924                          missing_feat_read = B_TRUE;
2925 2925  
2926 2926                  if (spa_writeable(spa) ||
2927 2927                      spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
2928 2928                          if (!spa_features_check(spa, B_TRUE,
2929 2929                              unsup_feat, enabled_feat)) {
2930 2930                                  *missing_feat_writep = B_TRUE;
2931 2931                          }
2932 2932                  }
2933 2933  
2934 2934                  fnvlist_add_nvlist(spa->spa_load_info,
2935 2935                      ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
2936 2936  
2937 2937                  if (!nvlist_empty(unsup_feat)) {
2938 2938                          fnvlist_add_nvlist(spa->spa_load_info,
2939 2939                              ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
2940 2940                  }
2941 2941  
2942 2942                  fnvlist_free(enabled_feat);
2943 2943                  fnvlist_free(unsup_feat);
2944 2944  
2945 2945                  if (!missing_feat_read) {
2946 2946                          fnvlist_add_boolean(spa->spa_load_info,
2947 2947                              ZPOOL_CONFIG_CAN_RDONLY);
2948 2948                  }
2949 2949  
2950 2950                  /*
2951 2951                   * If the state is SPA_LOAD_TRYIMPORT, our objective is
2952 2952                   * twofold: to determine whether the pool is available for
2953 2953                   * import in read-write mode and (if it is not) whether the
2954 2954                   * pool is available for import in read-only mode. If the pool
2955 2955                   * is available for import in read-write mode, it is displayed
2956 2956                   * as available in userland; if it is not available for import
2957 2957                   * in read-only mode, it is displayed as unavailable in
2958 2958                   * userland. If the pool is available for import in read-only
2959 2959                   * mode but not read-write mode, it is displayed as unavailable
2960 2960                   * in userland with a special note that the pool is actually
2961 2961                   * available for open in read-only mode.
2962 2962                   *
2963 2963                   * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
2964 2964                   * missing a feature for write, we must first determine whether
2965 2965                   * the pool can be opened read-only before returning to
2966 2966                   * userland in order to know whether to display the
2967 2967                   * abovementioned note.
2968 2968                   */
2969 2969                  if (missing_feat_read || (*missing_feat_writep &&
2970 2970                      spa_writeable(spa))) {
2971 2971                          spa_load_failed(spa, "pool uses unsupported features");
2972 2972                          return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
2973 2973                              ENOTSUP));
2974 2974                  }
2975 2975  
2976 2976                  /*
2977 2977                   * Load refcounts for ZFS features from disk into an in-memory
2978 2978                   * cache during SPA initialization.
2979 2979                   */
2980 2980                  for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
2981 2981                          uint64_t refcount;
2982 2982  
2983 2983                          error = feature_get_refcount_from_disk(spa,
2984 2984                              &spa_feature_table[i], &refcount);
2985 2985                          if (error == 0) {
2986 2986                                  spa->spa_feat_refcount_cache[i] = refcount;
2987 2987                          } else if (error == ENOTSUP) {
2988 2988                                  spa->spa_feat_refcount_cache[i] =
2989 2989                                      SPA_FEATURE_DISABLED;
2990 2990                          } else {
2991 2991                                  spa_load_failed(spa, "error getting refcount "
2992 2992                                      "for feature %s [error=%d]",
2993 2993                                      spa_feature_table[i].fi_guid, error);
2994 2994                                  return (spa_vdev_err(rvd,
2995 2995                                      VDEV_AUX_CORRUPT_DATA, EIO));
2996 2996                          }
2997 2997                  }
2998 2998          }
2999 2999  
3000 3000          if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
3001 3001                  if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
3002 3002                      &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
3003 3003                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3004 3004          }
3005 3005  
3006 3006          return (0);
3007 3007  }
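
The refcount-loading loop at the end maps three outcomes per feature: a value
read from disk, ENOTSUP for a feature that was never enabled on this pool,
and anything else treated as corruption that aborts the load. A small
standalone model of that mapping (the on-disk lookup is faked, and modeling
SPA_FEATURE_DISABLED as an all-ones sentinel is an assumption for
illustration):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

#define	FEATURE_DISABLED ((uint64_t)-1)	/* stand-in for SPA_FEATURE_DISABLED */

/* toy on-disk lookup: even features exist, odd ones were never enabled */
static int
get_refcount_from_disk(int feat, uint64_t *out)
{
	if (feat % 2 == 0) {
		*out = (uint64_t)feat * 10;
		return (0);
	}
	return (ENOTSUP);
}

int
main(void)
{
	uint64_t cache[4];

	for (int i = 0; i < 4; i++) {
		uint64_t rc;
		int err = get_refcount_from_disk(i, &rc);

		if (err == 0)
			cache[i] = rc;			/* feature in use */
		else if (err == ENOTSUP)
			cache[i] = FEATURE_DISABLED;	/* never enabled */
		else
			return (1);	/* real I/O error: abort the load */
	}
	for (int i = 0; i < 4; i++)
		printf("feat %d -> %llu\n", i, (unsigned long long)cache[i]);
	return (0);
}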
3008 3008  
3009 3009  static int
3010 3010  spa_ld_load_special_directories(spa_t *spa)
3011 3011  {
3012 3012          int error = 0;
3013 3013          vdev_t *rvd = spa->spa_root_vdev;
3014 3014  
3015 3015          spa->spa_is_initializing = B_TRUE;
3016 3016          error = dsl_pool_open(spa->spa_dsl_pool);
3017 3017          spa->spa_is_initializing = B_FALSE;
3018 3018          if (error != 0) {
3019 3019                  spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error);
3020 3020                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3021 3021          }
3022 3022  
3023 3023          return (0);
3024 3024  }
3025 3025  
3026 3026  static int
3027 3027  spa_ld_get_props(spa_t *spa)
3028 3028  {
3029 3029          int error = 0;
3030 3030          uint64_t obj;
3031 3031          vdev_t *rvd = spa->spa_root_vdev;
3032 3032  
3033 3033          /* Grab the secret checksum salt from the MOS. */
3034 3034          error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3035 3035              DMU_POOL_CHECKSUM_SALT, 1,
3036 3036              sizeof (spa->spa_cksum_salt.zcs_bytes),
3037 3037              spa->spa_cksum_salt.zcs_bytes);
3038 3038          if (error == ENOENT) {
3039 3039                  /* Generate a new salt for subsequent use */
3040 3040                  (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
3041 3041                      sizeof (spa->spa_cksum_salt.zcs_bytes));
3042 3042          } else if (error != 0) {
3043 3043                  spa_load_failed(spa, "unable to retrieve checksum salt from "
3044 3044                      "MOS [error=%d]", error);
3045 3045                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3046 3046          }
3047 3047  
3048 3048          if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
3049 3049                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3050 3050          error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
3051 3051          if (error != 0) {
3052 3052                  spa_load_failed(spa, "error opening deferred-frees bpobj "
3053 3053                      "[error=%d]", error);
3054 3054                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3055 3055          }
3056 3056  
3057 3057          /*
3058 3058           * Load the bit that tells us to use the new accounting function
3059 3059           * (raid-z deflation).  If we have an older pool, this will not
3060 3060           * be present.
3061 3061           */
3062 3062          error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
3063 3063          if (error != 0 && error != ENOENT)
3064 3064                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3065 3065  
3066 3066          error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
3067 3067              &spa->spa_creation_version, B_FALSE);
3068 3068          if (error != 0 && error != ENOENT)
3069 3069                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3070 3070  
3071 3071          /*
3072 3072           * Load the persistent error log.  If we have an older pool, this will
3073 3073           * not be present.
3074 3074           */
3075 3075          error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
3076 3076              B_FALSE);
3077 3077          if (error != 0 && error != ENOENT)
3078 3078                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3079 3079  
3080 3080          error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
3081 3081              &spa->spa_errlog_scrub, B_FALSE);
3082 3082          if (error != 0 && error != ENOENT)
3083 3083                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3084 3084  
3085 3085          /*
3086 3086           * Load the history object.  If we have an older pool, this
3087 3087           * will not be present.
3088 3088           */
3089 3089          error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
3090 3090          if (error != 0 && error != ENOENT)
3091 3091                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3092 3092  
3093 3093          /*
3094 3094           * Load the per-vdev ZAP map. If we have an older pool, this will not
3095 3095           * be present; in this case, defer its creation to a later time to
3096 3096           * avoid dirtying the MOS this early, outside of sync context. See
3097 3097           * spa_sync_config_object.
3098 3098           */
3099 3099  
3100 3100          /* The sentinel is only available in the MOS config. */
3101 3101          nvlist_t *mos_config;
3102 3102          if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
3103 3103                  spa_load_failed(spa, "unable to retrieve MOS config");
3104 3104                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3105 3105          }
3106 3106  
3107 3107          error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
3108 3108              &spa->spa_all_vdev_zaps, B_FALSE);
3109 3109  
3110 3110          if (error == ENOENT) {
3111 3111                  VERIFY(!nvlist_exists(mos_config,
3112 3112                      ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
3113 3113                  spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
3114 3114                  ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3115 3115          } else if (error != 0) {
3116 3116                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3117 3117          } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
3118 3118                  /*
3119 3119                   * An older version of ZFS overwrote the sentinel value, so
3120 3120                   * we have orphaned per-vdev ZAPs in the MOS. Defer their
3121 3121                   * destruction to later; see spa_sync_config_object.
3122 3122                   */
3123 3123                  spa->spa_avz_action = AVZ_ACTION_DESTROY;
3124 3124                  /*
3125 3125                   * We're assuming that no vdevs have had their ZAPs created
3126 3126                   * before this. Better be sure of it.
3127 3127                   */
3128 3128                  ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
3129 3129          }
3130 3130          nvlist_free(mos_config);
3131 3131  
3132 3132          spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3133 3133  
3134 3134          error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
3135 3135              B_FALSE);
3136 3136          if (error && error != ENOENT)
3137 3137                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3138 3138  
3139 3139          if (error == 0) {
3140 3140                  uint64_t autoreplace;
3141 3141  
3142 3142                  spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
3143 3143                  spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
3144 3144                  spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
3145 3145                  spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
3146 3146                  spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
3147 3147                  spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
3148 3148                      &spa->spa_dedup_ditto);
3149 3149  
3150 3150                  spa->spa_autoreplace = (autoreplace != 0);
3151 3151          }
3152 3152  
3153 3153          /*
3154 3154           * If we are importing a pool with missing top-level vdevs,
3155 3155           * we enforce that the pool doesn't panic or get suspended on
3156 3156           * error since the likelihood of missing data is extremely high.
3157 3157           */
3158 3158          if (spa->spa_missing_tvds > 0 &&
3159 3159              spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
3160 3160              spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3161 3161                  spa_load_note(spa, "forcing failmode to 'continue' "
3162 3162                      "as some top level vdevs are missing");
3163 3163                  spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
3164 3164          }
3165 3165  
3166 3166          return (0);
3167 3167  }
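
Several lookups above follow the same "optional property" idiom: ENOENT
simply means an older pool that predates the feature, while any other error
is treated as corruption. The checksum salt adds a twist by generating a
fresh value on ENOENT. A compressed sketch of that idiom (the MOS lookup is
faked, and rand() stands in for random_get_pseudo_bytes()):

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

/* toy MOS lookup: pretend the salt entry does not exist on older pools */
static int
lookup_salt(unsigned char *buf, size_t len)
{
	(void) buf;
	(void) len;
	return (ENOENT);
}

int
main(void)
{
	unsigned char salt[32];
	int err = lookup_salt(salt, sizeof (salt));

	if (err == ENOENT) {
		/* no salt on disk: generate one for subsequent use */
		for (size_t i = 0; i < sizeof (salt); i++)
			salt[i] = (unsigned char)rand();
	} else if (err != 0) {
		return (1);	/* any other error is fatal to the load */
	}
	printf("salt[0]=%u\n", salt[0]);
	return (0);
}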
3168 3168  
3169 3169  static int
3170 3170  spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
3171 3171  {
3172 3172          int error = 0;
3173 3173          vdev_t *rvd = spa->spa_root_vdev;
3174 3174  
3175 3175          /*
3176 3176           * If we're assembling the pool from the split-off vdevs of
3177 3177           * an existing pool, we don't want to attach the spares & cache
3178 3178           * devices.
3179 3179           */
3180 3180  
3181 3181          /*
3182 3182           * Load any hot spares for this pool.
3183 3183           */
3184 3184          error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
3185 3185              B_FALSE);
3186 3186          if (error != 0 && error != ENOENT)
3187 3187                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3188 3188          if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3189 3189                  ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
3190 3190                  if (load_nvlist(spa, spa->spa_spares.sav_object,
3191 3191                      &spa->spa_spares.sav_config) != 0) {
3192 3192                          spa_load_failed(spa, "error loading spares nvlist");
3193 3193                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3194 3194                  }
3195 3195  
3196 3196                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3197 3197                  spa_load_spares(spa);
3198 3198                  spa_config_exit(spa, SCL_ALL, FTAG);
3199 3199          } else if (error == 0) {
3200 3200                  spa->spa_spares.sav_sync = B_TRUE;
3201 3201          }
3202 3202  
3203 3203          /*
3204 3204           * Load any level 2 ARC devices for this pool.
3205 3205           */
3206 3206          error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
3207 3207              &spa->spa_l2cache.sav_object, B_FALSE);
3208 3208          if (error != 0 && error != ENOENT)
3209 3209                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3210 3210          if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
3211 3211                  ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
3212 3212                  if (load_nvlist(spa, spa->spa_l2cache.sav_object,
3213 3213                      &spa->spa_l2cache.sav_config) != 0) {
3214 3214                          spa_load_failed(spa, "error loading l2cache nvlist");
3215 3215                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3216 3216                  }
3217 3217  
3218 3218                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3219 3219                  spa_load_l2cache(spa);
3220 3220                  spa_config_exit(spa, SCL_ALL, FTAG);
3221 3221          } else if (error == 0) {
3222 3222                  spa->spa_l2cache.sav_sync = B_TRUE;
3223 3223          }
3224 3224  
3225 3225          return (0);
3226 3226  }
3227 3227  
3228 3228  static int
3229 3229  spa_ld_load_vdev_metadata(spa_t *spa)
3230 3230  {
3231 3231          int error = 0;
3232 3232          vdev_t *rvd = spa->spa_root_vdev;
3233 3233  
3234 3234          /*
3235 3235           * If the 'autoreplace' property is set, then post a resource notifying
3236 3236           * the ZFS DE that it should not issue any faults for unopenable
3237 3237           * devices.  We also iterate over the vdevs, and post a sysevent for any
3238 3238           * unopenable vdevs so that the normal autoreplace handler can take
3239 3239           * over.
3240 3240           */
3241 3241          if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3242 3242                  spa_check_removed(spa->spa_root_vdev);
3243 3243                  /*
3244 3244                   * For the import case, this is done in spa_import(), because
3245 3245                   * at this point we're using the spare definitions from
3246 3246                   * the MOS config, not necessarily from the userland config.
3247 3247                   */
3248 3248                  if (spa->spa_load_state != SPA_LOAD_IMPORT) {
3249 3249                          spa_aux_check_removed(&spa->spa_spares);
3250 3250                          spa_aux_check_removed(&spa->spa_l2cache);
3251 3251                  }
3252 3252          }
3253 3253  
3254 3254          /*
3255 3255           * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc.
3256 3256           */
3257 3257          error = vdev_load(rvd);
3258 3258          if (error != 0) {
3259 3259                  spa_load_failed(spa, "vdev_load failed [error=%d]", error);
3260 3260                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
3261 3261          }
3262 3262  
3263 3263          /*
3264 3264           * Propagate the leaf DTLs we just loaded all the way up the vdev tree.
3265 3265           */
3266 3266          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3267 3267          vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
3268 3268          spa_config_exit(spa, SCL_ALL, FTAG);
3269 3269  
3270 3270          return (0);
3271 3271  }
3272 3272  
3273 3273  static int
3274 3274  spa_ld_load_dedup_tables(spa_t *spa)
3275 3275  {
3276 3276          int error = 0;
3277 3277          vdev_t *rvd = spa->spa_root_vdev;
3278 3278  
3279 3279          error = ddt_load(spa);
3280 3280          if (error != 0) {
3281 3281                  spa_load_failed(spa, "ddt_load failed [error=%d]", error);
3282 3282                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
3283 3283          }
3284 3284  
3285 3285          return (0);
3286 3286  }
3287 3287  
3288 3288  static int
3289 3289  spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport)
3290 3290  {
3291 3291          vdev_t *rvd = spa->spa_root_vdev;
3292 3292  
3293 3293          if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) {
3294 3294                  boolean_t missing = spa_check_logs(spa);
3295 3295                  if (missing) {
3296 3296                          if (spa->spa_missing_tvds != 0) {
3297 3297                                  spa_load_note(spa, "spa_check_logs failed "
3298 3298                                      "so dropping the logs");
3299 3299                          } else {
3300 3300                                  *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
3301 3301                                  spa_load_failed(spa, "spa_check_logs failed");
3302 3302                                  return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG,
3303 3303                                      ENXIO));
3304 3304                          }
3305 3305                  }
3306 3306          }
3307 3307  
3308 3308          return (0);
3309 3309  }
3310 3310  
3311 3311  static int
3312 3312  spa_ld_verify_pool_data(spa_t *spa)
3313 3313  {
3314 3314          int error = 0;
3315 3315          vdev_t *rvd = spa->spa_root_vdev;
3316 3316  
3317 3317          /*
3318 3318           * We've successfully opened the pool, verify that we're ready
3319 3319           * to start pushing transactions.
3320 3320           */
3321 3321          if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
3322 3322                  error = spa_load_verify(spa);
3323 3323                  if (error != 0) {
3324 3324                          spa_load_failed(spa, "spa_load_verify failed "
3325 3325                              "[error=%d]", error);
3326 3326                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
3327 3327                              error));
3328 3328                  }
3329 3329          }
3330 3330  
3331 3331          return (0);
3332 3332  }
3333 3333  
3334 3334  static void
3335 3335  spa_ld_claim_log_blocks(spa_t *spa)
3336 3336  {
3337 3337          dmu_tx_t *tx;
3338 3338          dsl_pool_t *dp = spa_get_dsl(spa);
3339 3339  
3340 3340          /*
3341 3341           * Claim log blocks that haven't been committed yet.
3342 3342           * This must all happen in a single txg.
3343 3343           * Note: spa_claim_max_txg is updated by spa_claim_notify(),
3344 3344           * invoked from zil_claim_log_block()'s i/o done callback.
3345 3345           * Price of rollback is that we abandon the log.
3346 3346           * The price of rollback is that we abandon the log.
3347 3347          spa->spa_claiming = B_TRUE;
3348 3348  
3349 3349          tx = dmu_tx_create_assigned(dp, spa_first_txg(spa));
3350 3350          (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
3351 3351              zil_claim, tx, DS_FIND_CHILDREN);
3352 3352          dmu_tx_commit(tx);
3353 3353  
3354 3354          spa->spa_claiming = B_FALSE;
3355 3355  
3356 3356          spa_set_log_state(spa, SPA_LOG_GOOD);
3357 3357  }
3358 3358  
3359 3359  static void
3360 3360  spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
3361 3361      boolean_t update_config_cache)
3362 3362  {
3363 3363          vdev_t *rvd = spa->spa_root_vdev;
3364 3364          int need_update = B_FALSE;
3365 3365  
3366 3366          /*
3367 3367           * If the config cache is stale, or we have uninitialized
3368 3368           * metaslabs (see spa_vdev_add()), then update the config.
3369 3369           *
3370 3370           * If this is a verbatim import, trust the current
3371 3371           * in-core spa_config and update the disk labels.
3372 3372           */
3373 3373          if (update_config_cache || config_cache_txg != spa->spa_config_txg ||
3374 3374              spa->spa_load_state == SPA_LOAD_IMPORT ||
3375 3375              spa->spa_load_state == SPA_LOAD_RECOVER ||
3376 3376              (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
3377 3377                  need_update = B_TRUE;
3378 3378  
3379 3379          for (int c = 0; c < rvd->vdev_children; c++)
3380 3380                  if (rvd->vdev_child[c]->vdev_ms_array == 0)
3381 3381                          need_update = B_TRUE;
3382 3382  
3383 3383          /*
3384 3384           * Update the config cache asynchronously in case we're the
3385 3385           * root pool, in which case the config cache isn't writable yet.
3386 3386           */
3387 3387          if (need_update)
3388 3388                  spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
3389 3389  }
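
The decision above boils down to a predicate: rewrite the config cache if the
caller asked for it, the cached txg is stale, we are importing or recovering,
or any top-level child still has no metaslab array. A sketch of that
predicate over a toy pool state (the field names here are illustrative, not
spa_t members):

#include <stdio.h>

struct state {
	int update_requested;	/* caller already decided a rewrite is needed */
	unsigned long long cache_txg, pool_txg;
	int importing;		/* import/recover/verbatim paths always rewrite */
	int n_children;
	unsigned long long ms_array[3];	/* 0 => uninitialized metaslab array */
};

static int
need_config_update(const struct state *s)
{
	if (s->update_requested || s->cache_txg != s->pool_txg || s->importing)
		return (1);
	for (int c = 0; c < s->n_children; c++)
		if (s->ms_array[c] == 0)
			return (1);
	return (0);
}

int
main(void)
{
	struct state s = { 0, 42, 42, 0, 3, { 7, 0, 9 } };

	printf("need update: %d\n", need_config_update(&s)); /* 1: child 1 */
	return (0);
}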
3390 3390  
3391 3391  static void
3392 3392  spa_ld_prepare_for_reload(spa_t *spa)
3393 3393  {
3394 3394          int mode = spa->spa_mode;
3395 3395          int async_suspended = spa->spa_async_suspended;
3396 3396  
3397 3397          spa_unload(spa);
3398 3398          spa_deactivate(spa);
3399 3399          spa_activate(spa, mode);
3400 3400  
3401 3401          /*
3402 3402           * We save the value of spa_async_suspended as it gets reset to 0 by
3403 3403           * spa_unload(). We want to restore it to the original value before
3404 3404           * returning as we might be calling spa_async_resume() later.
3405 3405           */
3406 3406          spa->spa_async_suspended = async_suspended;
3407 3407  }
3408 3408  
3409 3409  static int
3410 3410  spa_ld_read_checkpoint_txg(spa_t *spa)
3411 3411  {
3412 3412          uberblock_t checkpoint;
3413 3413          int error = 0;
3414 3414  
3415 3415          ASSERT0(spa->spa_checkpoint_txg);
3416 3416          ASSERT(MUTEX_HELD(&spa_namespace_lock));
3417 3417  
3418 3418          error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3419 3419              DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3420 3420              sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3421 3421  
3422 3422          if (error == ENOENT)
3423 3423                  return (0);
3424 3424  
3425 3425          if (error != 0)
3426 3426                  return (error);
3427 3427  
3428 3428          ASSERT3U(checkpoint.ub_txg, !=, 0);
3429 3429          ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
3430 3430          ASSERT3U(checkpoint.ub_timestamp, !=, 0);
3431 3431          spa->spa_checkpoint_txg = checkpoint.ub_txg;
3432 3432          spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
3433 3433  
3434 3434          return (0);
3435 3435  }
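
The zap_lookup() call above reads the checkpointed uberblock as raw machine
words: integer size sizeof (uint64_t), count sizeof (uberblock_t) /
sizeof (uint64_t). The same serialization trick in miniature, with a toy
fixed-layout record standing in for uberblock_t:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* toy fixed-layout record, stored as raw uint64_t words like the ZAP entry */
struct rec {
	uint64_t txg;
	uint64_t checkpoint_txg;
	uint64_t timestamp;
};

int
main(void)
{
	struct rec in = { 1234, 1234, 1700000000 }, out;
	uint64_t words[sizeof (struct rec) / sizeof (uint64_t)];

	/* "write": serialize the struct as integer words */
	memcpy(words, &in, sizeof (in));
	/* "read": integer size 8, count sizeof (rec) / 8, as in zap_lookup */
	memcpy(&out, words, sizeof (out));

	printf("txg=%llu\n", (unsigned long long)out.txg);
	return (0);
}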
3436 3436  
3437 3437  static int
3438 3438  spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
3439 3439  {
3440 3440          int error = 0;
3441 3441  
3442 3442          ASSERT(MUTEX_HELD(&spa_namespace_lock));
3443 3443          ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
3444 3444  
3445 3445          /*
3446 3446           * Never trust the config that is provided unless we are assembling
3447 3447           * a pool following a split.
3448 3448           * This means don't trust blkptrs and the vdev tree in general. This
3449 3449           * also effectively puts the spa in read-only mode since
3450 3450           * spa_writeable() checks for spa_trust_config to be true.
3451 3451           * We will later load a trusted config from the MOS.
3452 3452           */
3453 3453          if (type != SPA_IMPORT_ASSEMBLE)
3454 3454                  spa->spa_trust_config = B_FALSE;
3455 3455  
3456 3456          /*
3457 3457           * Parse the config provided to create a vdev tree.
3458 3458           */
3459 3459          error = spa_ld_parse_config(spa, type);
3460 3460          if (error != 0)
3461 3461                  return (error);
3462 3462  
3463 3463          /*
3464 3464           * Now that we have the vdev tree, try to open each vdev. This involves
3465 3465           * opening the underlying physical device, retrieving its geometry and
3466 3466           * probing the vdev with a dummy I/O. The state of each vdev will be set
3467 3467           * based on the success of those operations. After this we'll be ready
3468 3468           * to read from the vdevs.
3469 3469           */
3470 3470          error = spa_ld_open_vdevs(spa);
3471 3471          if (error != 0)
3472 3472                  return (error);
3473 3473  
3474 3474          /*
3475 3475           * Read the label of each vdev and make sure that the GUIDs stored
3476 3476           * there match the GUIDs in the config provided.
3477 3477           * If we're assembling a new pool that's been split off from an
3478 3478           * existing pool, the labels haven't yet been updated so we skip
3479 3479           * validation for now.
3480 3480           */
3481 3481          if (type != SPA_IMPORT_ASSEMBLE) {
3482 3482                  error = spa_ld_validate_vdevs(spa);
3483 3483                  if (error != 0)
3484 3484                          return (error);
3485 3485          }
3486 3486  
3487 3487          /*
3488 3488           * Read all vdev labels to find the best uberblock (i.e. latest,
3489 3489           * unless spa_load_max_txg is set) and store it in spa_uberblock. We
3490 3490           * get the list of features required to read blkptrs in the MOS from
3491 3491           * the vdev label with the best uberblock and verify that our version
3492 3492           * of zfs supports them all.
3493 3493           */
3494 3494          error = spa_ld_select_uberblock(spa, type);
3495 3495          if (error != 0)
3496 3496                  return (error);
3497 3497  
3498 3498          /*
3499 3499           * Pass that uberblock to the dsl_pool layer which will open the root
3500 3500           * blkptr. This blkptr points to the latest version of the MOS and will
3501 3501           * allow us to read its contents.
3502 3502           */
3503 3503          error = spa_ld_open_rootbp(spa);
3504 3504          if (error != 0)
3505 3505                  return (error);
3506 3506  
3507 3507          return (0);
3508 3508  }
3509 3509  
3510 3510  static int
3511 3511  spa_ld_checkpoint_rewind(spa_t *spa)
3512 3512  {
3513 3513          uberblock_t checkpoint;
3514 3514          int error = 0;
3515 3515  
3516 3516          ASSERT(MUTEX_HELD(&spa_namespace_lock));
3517 3517          ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3518 3518  
3519 3519          error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
3520 3520              DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
3521 3521              sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
3522 3522  
3523 3523          if (error != 0) {
3524 3524                  spa_load_failed(spa, "unable to retrieve checkpointed "
3525 3525                      "uberblock from the MOS config [error=%d]", error);
3526 3526  
3527 3527                  if (error == ENOENT)
3528 3528                          error = ZFS_ERR_NO_CHECKPOINT;
3529 3529  
3530 3530                  return (error);
3531 3531          }
3532 3532  
3533 3533          ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
3534 3534          ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
3535 3535  
3536 3536          /*
3537 3537           * We need to update the txg and timestamp of the checkpointed
3538 3538           * uberblock to be higher than the latest one. This ensures that
3539 3539           * the checkpointed uberblock is selected if we were to close and
3540 3540           * reopen the pool right after we've written it in the vdev labels.
3541 3541           * (also see block comment in vdev_uberblock_compare)
3542 3542           */
3543 3543          checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
3544 3544          checkpoint.ub_timestamp = gethrestime_sec();
3545 3545  
3546 3546          /*
3547 3547           * Set current uberblock to be the checkpointed uberblock.
3548 3548           */
3549 3549          spa->spa_uberblock = checkpoint;
3550 3550  
3551 3551          /*
3552 3552           * If we are doing a normal rewind, then the pool is open for
3553 3553           * writing and we sync the "updated" checkpointed uberblock to
3554 3554           * disk. Once this is done, we've basically rewound the whole
3555 3555           * pool and there is no way back.
3556 3556           *
3557 3557           * There are cases when we don't want to attempt to sync the
3558 3558           * checkpointed uberblock to disk because we are opening a
3559 3559           * pool as read-only. Specifically, verifying the checkpointed
3560 3560           * state with zdb, and importing the checkpointed state to get
3561 3561           * a "preview" of its content.
3562 3562           */
3563 3563          if (spa_writeable(spa)) {
3564 3564                  vdev_t *rvd = spa->spa_root_vdev;
3565 3565  
3566 3566                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3567 3567                  vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
3568 3568                  int svdcount = 0;
3569 3569                  int children = rvd->vdev_children;
3570 3570                  int c0 = spa_get_random(children);
3571 3571  
3572 3572                  for (int c = 0; c < children; c++) {
3573 3573                          vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
3574 3574  
3575 3575                          /* Stop when revisiting the first vdev */
3576 3576                          if (c > 0 && svd[0] == vd)
3577 3577                                  break;
3578 3578  
3579 3579                          if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
3580 3580                              !vdev_is_concrete(vd))
3581 3581                                  continue;
3582 3582  
3583 3583                          svd[svdcount++] = vd;
3584 3584                          if (svdcount == SPA_SYNC_MIN_VDEVS)
3585 3585                                  break;
3586 3586                  }
3587 3587                  error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
3588 3588                  if (error == 0)
3589 3589                          spa->spa_last_synced_guid = rvd->vdev_guid;
3590 3590                  spa_config_exit(spa, SCL_ALL, FTAG);
3591 3591  
3592 3592                  if (error != 0) {
3593 3593                          spa_load_failed(spa, "failed to write checkpointed "
3594 3594                              "uberblock to the vdev labels [error=%d]", error);
3595 3595                          return (error);
3596 3596                  }
3597 3597          }
3598 3598  
3599 3599          return (0);
3600 3600  }
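
Why the txg bump above matters: uberblock selection at open time prefers
higher txgs, so raising the checkpointed uberblock past the latest one is
what makes it win the label election. A minimal sketch of that ordering
(simplified; the real vdev_uberblock_compare() in vdev_label.c performs
further tie-breaking):

    /*
     * Simplified sketch, not the actual vdev_uberblock_compare():
     * a higher txg wins, and on equal txgs the later timestamp wins.
     * Rewriting the checkpointed uberblock with ub_txg + 1 and a fresh
     * timestamp therefore sorts it ahead of the previously-current one.
     */
    static int
    ub_compare_sketch(const uberblock_t *u1, const uberblock_t *u2)
    {
            if (u1->ub_txg < u2->ub_txg)
                    return (-1);
            if (u1->ub_txg > u2->ub_txg)
                    return (1);
            if (u1->ub_timestamp < u2->ub_timestamp)
                    return (-1);
            if (u1->ub_timestamp > u2->ub_timestamp)
                    return (1);
            return (0);
    }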
3601 3601  
3602 3602  static int
3603 3603  spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
3604 3604      boolean_t *update_config_cache)
3605 3605  {
3606 3606          int error;
3607 3607  
3608 3608          /*
3609 3609           * Parse the config for pool, open and validate vdevs,
3610 3610           * select an uberblock, and use that uberblock to open
3611 3611           * the MOS.
3612 3612           */
3613 3613          error = spa_ld_mos_init(spa, type);
3614 3614          if (error != 0)
3615 3615                  return (error);
3616 3616  
3617 3617          /*
3618 3618           * Retrieve the trusted config stored in the MOS and use it to create
3619 3619           * a new, exact version of the vdev tree, then reopen all vdevs.
3620 3620           */
3621 3621          error = spa_ld_trusted_config(spa, type, B_FALSE);
3622 3622          if (error == EAGAIN) {
3623 3623                  if (update_config_cache != NULL)
3624 3624                          *update_config_cache = B_TRUE;
3625 3625  
3626 3626                  /*
3627 3627                   * Redo the loading process with the trusted config if it is
3628 3628                   * too different from the untrusted config.
3629 3629                   */
3630 3630                  spa_ld_prepare_for_reload(spa);
3631 3631                  spa_load_note(spa, "RELOADING");
3632 3632                  error = spa_ld_mos_init(spa, type);
3633 3633                  if (error != 0)
3634 3634                          return (error);
3635 3635  
3636 3636                  error = spa_ld_trusted_config(spa, type, B_TRUE);
3637 3637                  if (error != 0)
3638 3638                          return (error);
3639 3639  
3640 3640          } else if (error != 0) {
3641 3641                  return (error);
3642 3642          }
3643 3643  
3644 3644          return (0);
3645 3645  }
3646 3646  
3647 3647  /*
3648 3648   * Load an existing storage pool, using the config provided. This config
3649 3649   * describes which vdevs are part of the pool and is later validated against
3650 3650   * partial configs present in each vdev's label and an entire copy of the
3651 3651   * config stored in the MOS.
3652 3652   */
3653 3653  static int
3654 3654  spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport)
3655 3655  {
3656 3656          int error = 0;
3657 3657          boolean_t missing_feat_write = B_FALSE;
3658 3658          boolean_t checkpoint_rewind =
3659 3659              (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3660 3660          boolean_t update_config_cache = B_FALSE;
3661 3661  
3662 3662          ASSERT(MUTEX_HELD(&spa_namespace_lock));
3663 3663          ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
3664 3664  
3665 3665          spa_load_note(spa, "LOADING");
3666 3666  
3667 3667          error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
3668 3668          if (error != 0)
3669 3669                  return (error);
3670 3670  
3671 3671          /*
3672 3672           * If we are rewinding to the checkpoint, then we need to repeat
3673 3673           * everything we've done so far in this function but this time
3674 3674           * selecting the checkpointed uberblock and using that to open
3675 3675           * the MOS.
3676 3676           */
3677 3677          if (checkpoint_rewind) {
3678 3678                  /*
3679 3679                   * If we are rewinding to the checkpoint, update the config
3680 3680                   * cache anyway.
3681 3681                   */
3682 3682                  update_config_cache = B_TRUE;
3683 3683  
3684 3684                  /*
3685 3685                   * Extract the checkpointed uberblock from the current MOS
3686 3686                   * and use this as the pool's uberblock from now on. If the
3687 3687                   * pool is imported as writeable we also write the checkpoint
3688 3688                   * uberblock to the labels, making the rewind permanent.
3689 3689                   */
3690 3690                  error = spa_ld_checkpoint_rewind(spa);
3691 3691                  if (error != 0)
3692 3692                          return (error);
3693 3693  
3694 3694                  /*
3695 3695                   * Redo the loading process again with the
3696 3696                   * checkpointed uberblock.
3697 3697                   */
3698 3698                  spa_ld_prepare_for_reload(spa);
3699 3699                  spa_load_note(spa, "LOADING checkpointed uberblock");
3700 3700                  error = spa_ld_mos_with_trusted_config(spa, type, NULL);
3701 3701                  if (error != 0)
3702 3702                          return (error);
3703 3703          }
3704 3704  
3705 3705          /*
3706 3706           * Retrieve the checkpoint txg if the pool has a checkpoint.
3707 3707           */
3708 3708          error = spa_ld_read_checkpoint_txg(spa);
3709 3709          if (error != 0)
3710 3710                  return (error);
3711 3711  
3712 3712          /*
3713 3713           * Retrieve the mapping of indirect vdevs. Those vdevs were removed
3714 3714           * from the pool and their contents were re-mapped to other vdevs. Note
3715 3715           * that everything that we read before this step must have been
3716 3716           * rewritten on concrete vdevs after the last device removal was
3717 3717           * initiated. Otherwise we could be reading from indirect vdevs before
3718 3718           * we have loaded their mappings.
3719 3719           */
3720 3720          error = spa_ld_open_indirect_vdev_metadata(spa);
3721 3721          if (error != 0)
3722 3722                  return (error);
3723 3723  
3724 3724          /*
3725 3725           * Retrieve the full list of active features from the MOS and check if
3726 3726           * they are all supported.
3727 3727           */
3728 3728          error = spa_ld_check_features(spa, &missing_feat_write);
3729 3729          if (error != 0)
3730 3730                  return (error);
3731 3731  
3732 3732          /*
3733 3733           * Load several special directories from the MOS needed by the dsl_pool
3734 3734           * layer.
3735 3735           */
3736 3736          error = spa_ld_load_special_directories(spa);
3737 3737          if (error != 0)
3738 3738                  return (error);
3739 3739  
3740 3740          /*
3741 3741           * Retrieve pool properties from the MOS.
3742 3742           */
3743 3743          error = spa_ld_get_props(spa);
3744 3744          if (error != 0)
3745 3745                  return (error);
3746 3746  
3747 3747          /*
3748 3748           * Retrieve the list of auxiliary devices - cache devices and spares -
3749 3749           * and open them.
3750 3750           */
3751 3751          error = spa_ld_open_aux_vdevs(spa, type);
3752 3752          if (error != 0)
3753 3753                  return (error);
3754 3754  
3755 3755          /*
3756 3756           * Load the metadata for all vdevs. Also check if unopenable devices
3757 3757           * should be autoreplaced.
3758 3758           */
3759 3759          error = spa_ld_load_vdev_metadata(spa);
3760 3760          if (error != 0)
3761 3761                  return (error);
3762 3762  
3763 3763          error = spa_ld_load_dedup_tables(spa);
3764 3764          if (error != 0)
3765 3765                  return (error);
3766 3766  
3767 3767          /*
3768 3768           * Verify the logs now to make sure we don't have any unexpected errors
3769 3769           * when we claim log blocks later.
3770 3770           */
3771 3771          error = spa_ld_verify_logs(spa, type, ereport);
3772 3772          if (error != 0)
3773 3773                  return (error);
3774 3774  
3775 3775          if (missing_feat_write) {
3776 3776                  ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
3777 3777  
3778 3778                  /*
3779 3779                   * At this point, we know that we can open the pool in
3780 3780                   * read-only mode but not read-write mode. We now have enough
3781 3781                   * information and can return to userland.
3782 3782                   */
3783 3783                  return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
3784 3784                      ENOTSUP));
3785 3785          }
3786 3786  
3787 3787          /*
3788 3788           * Traverse the last txgs to make sure the pool was left off in a safe
3789 3789           * state. When performing an extreme rewind, we verify the whole pool,
3790 3790           * which can take a very long time.
3791 3791           */
3792 3792          error = spa_ld_verify_pool_data(spa);
3793 3793          if (error != 0)
3794 3794                  return (error);
3795 3795  
3796 3796          /*
3797 3797           * Calculate the deflated space for the pool. This must be done before
3798 3798           * we write anything to the pool because we'd need to update the space
3799 3799           * accounting using the deflated sizes.
3800 3800           */
3801 3801          spa_update_dspace(spa);
3802 3802  
3803 3803          /*
3804 3804           * We have now retrieved all the information we needed to open the
3805 3805           * pool. If we are importing the pool in read-write mode, a few
3806 3806           * additional steps must be performed to finish the import.
3807 3807           */
3808 3808          if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
3809 3809              spa->spa_load_max_txg == UINT64_MAX)) {
3810 3810                  uint64_t config_cache_txg = spa->spa_config_txg;
3811 3811  
3812 3812                  ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
3813 3813  
3814 3814                  /*
3815 3815                   * In case of a checkpoint rewind, log the original txg
3816 3816                   * of the checkpointed uberblock.
3817 3817                   */
3818 3818                  if (checkpoint_rewind) {
3819 3819                          spa_history_log_internal(spa, "checkpoint rewind",
3820 3820                              NULL, "rewound state to txg=%llu",
3821 3821                              (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
3822 3822                  }
3823 3823  
3824 3824                  /*
3825 3825                   * Traverse the ZIL and claim all blocks.
3826 3826                   */
3827 3827                  spa_ld_claim_log_blocks(spa);
3828 3828  
3829 3829                  /*
3830 3830                   * Kick-off the syncing thread.
3831 3831                   */
3832 3832                  spa->spa_sync_on = B_TRUE;
3833 3833                  txg_sync_start(spa->spa_dsl_pool);
3834 3834  
3835 3835                  /*
3836 3836                   * Wait for all claims to sync.  We sync up to the highest
3837 3837                   * claimed log block birth time so that claimed log blocks
3838 3838                   * don't appear to be from the future.  spa_claim_max_txg
3839 3839                   * will have been set for us by ZIL traversal operations
3840 3840                   * performed above.
3841 3841                   */
3842 3842                  txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
3843 3843  
3844 3844                  /*
3845 3845                   * Check if we need to request an update of the config. On the
3846 3846                   * next sync, we would update the config stored in vdev labels
3847 3847                   * and the cachefile (by default /etc/zfs/zpool.cache).
3848 3848                   */
3849 3849                  spa_ld_check_for_config_update(spa, config_cache_txg,
3850 3850                      update_config_cache);
3851 3851  
3852 3852                  /*
3853 3853                   * Check all DTLs to see if anything needs resilvering.
3854 3854                   */
3855 3855                  if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
3856 3856                      vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
3857 3857                          spa_async_request(spa, SPA_ASYNC_RESILVER);
3858 3858  
3859 3859                  /*
3860 3860                   * Log the fact that we booted up (so that we can detect if
3861 3861                   * we rebooted in the middle of an operation).
3862 3862                   */
3863 3863                  spa_history_log_version(spa, "open");
3864 3864  
3865 3865                  spa_restart_removal(spa);
3866 3866                  spa_spawn_aux_threads(spa);
3867 3867  
3868 3868                  /*
3869 3869                   * Delete any inconsistent datasets.
3870 3870                   *
3871 3871                   * Note:
3872 3872                   * Since we may be issuing deletes for clones here,
3873 3873                   * we make sure to do so after we've spawned all the
3874 3874                   * auxiliary threads above (of which the livelist
3875 3875                   * deletion zthr is one).
3876 3876                   */
3877 3877                  (void) dmu_objset_find(spa_name(spa),
3878 3878                      dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
3879 3879  
3880 3880                  /*
3881 3881                   * Clean up any stale temporary dataset userrefs.
3882 3882                   */
3883 3883                  dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
3884 3884  
3885 3885                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3886 3886                  vdev_initialize_restart(spa->spa_root_vdev);
3887 3887                  spa_config_exit(spa, SCL_CONFIG, FTAG);
3888 3888          }
3889 3889  
3890 3890          spa_load_note(spa, "LOADED");
3891 3891  
3892 3892          return (0);
3893 3893  }
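
For context, the checkpoint-rewind path above is driven entirely by
ZFS_IMPORT_CHECKPOINT in spa_import_flags. A hedged sketch of how an
import caller might request it (hypothetical helper, not part of this
change):

    /*
     * Sketch only: setting ZFS_IMPORT_CHECKPOINT before the load is
     * what makes checkpoint_rewind true in spa_load_impl() and routes
     * the load through spa_ld_checkpoint_rewind() plus a second pass.
     */
    static int
    spa_import_rewind_sketch(spa_t *spa)
    {
            spa->spa_import_flags |= ZFS_IMPORT_CHECKPOINT;
            return (spa_load(spa, SPA_LOAD_IMPORT, SPA_IMPORT_EXISTING));
    }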
3894 3894  
3895 3895  static int
3896 3896  spa_load_retry(spa_t *spa, spa_load_state_t state)
3897 3897  {
3898 3898          int mode = spa->spa_mode;
3899 3899  
3900 3900          spa_unload(spa);
3901 3901          spa_deactivate(spa);
3902 3902  
3903 3903          spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
3904 3904  
3905 3905          spa_activate(spa, mode);
3906 3906          spa_async_suspend(spa);
3907 3907  
3908 3908          spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
3909 3909              (u_longlong_t)spa->spa_load_max_txg);
3910 3910  
3911 3911          return (spa_load(spa, state, SPA_IMPORT_EXISTING));
3912 3912  }
3913 3913  
3914 3914  /*
3915 3915   * If spa_load() fails, this function will try loading prior txgs. If
3916 3916   * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
3917 3917   * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
3918 3918   * function will not rewind the pool and will return the same error as
3919 3919   * spa_load().
3920 3920   */
3921 3921  static int
3922 3922  spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
3923 3923      int rewind_flags)
3924 3924  {
3925 3925          nvlist_t *loadinfo = NULL;
3926 3926          nvlist_t *config = NULL;
3927 3927          int load_error, rewind_error;
3928 3928          uint64_t safe_rewind_txg;
3929 3929          uint64_t min_txg;
3930 3930  
3931 3931          if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
3932 3932                  spa->spa_load_max_txg = spa->spa_load_txg;
3933 3933                  spa_set_log_state(spa, SPA_LOG_CLEAR);
3934 3934          } else {
3935 3935                  spa->spa_load_max_txg = max_request;
3936 3936                  if (max_request != UINT64_MAX)
3937 3937                          spa->spa_extreme_rewind = B_TRUE;
3938 3938          }
3939 3939  
3940 3940          load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
3941 3941          if (load_error == 0)
3942 3942                  return (0);
3943 3943          if (load_error == ZFS_ERR_NO_CHECKPOINT) {
3944 3944                  /*
3945 3945                   * When attempting checkpoint-rewind on a pool with no
3946 3946                   * checkpoint, we should not attempt to load uberblocks
3947 3947                   * from previous txgs when spa_load fails.
3948 3948                   */
3949 3949                  ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
3950 3950                  return (load_error);
3951 3951          }
3952 3952  
3953 3953          if (spa->spa_root_vdev != NULL)
3954 3954                  config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3955 3955  
3956 3956          spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
3957 3957          spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
3958 3958  
3959 3959          if (rewind_flags & ZPOOL_NEVER_REWIND) {
3960 3960                  nvlist_free(config);
3961 3961                  return (load_error);
3962 3962          }
3963 3963  
3964 3964          if (state == SPA_LOAD_RECOVER) {
3965 3965                  /* Price of rolling back is discarding txgs, including log */
3966 3966                  spa_set_log_state(spa, SPA_LOG_CLEAR);
3967 3967          } else {
3968 3968                  /*
3969 3969                   * If we aren't rolling back save the load info from our first
3970 3970                   * import attempt so that we can restore it after attempting
3971 3971                   * to rewind.
3972 3972                   */
3973 3973                  loadinfo = spa->spa_load_info;
3974 3974                  spa->spa_load_info = fnvlist_alloc();
3975 3975          }
3976 3976  
3977 3977          spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
3978 3978          safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
3979 3979          min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
3980 3980              TXG_INITIAL : safe_rewind_txg;
3981 3981  
3982 3982          /*
3983 3983           * Continue as long as we're finding errors, we're still within
3984 3984           * the acceptable rewind range, and we're still finding uberblocks
3985 3985           */
3986 3986          while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
3987 3987              spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
3988 3988                  if (spa->spa_load_max_txg < safe_rewind_txg)
3989 3989                          spa->spa_extreme_rewind = B_TRUE;
3990 3990                  rewind_error = spa_load_retry(spa, state);
3991 3991          }
3992 3992  
3993 3993          spa->spa_extreme_rewind = B_FALSE;
3994 3994          spa->spa_load_max_txg = UINT64_MAX;
3995 3995  
3996 3996          if (config && (rewind_error || state != SPA_LOAD_RECOVER))
3997 3997                  spa_config_set(spa, config);
3998 3998          else
3999 3999                  nvlist_free(config);
4000 4000  
4001 4001          if (state == SPA_LOAD_RECOVER) {
4002 4002                  ASSERT3P(loadinfo, ==, NULL);
4003 4003                  return (rewind_error);
4004 4004          } else {
4005 4005                  /* Store the rewind info as part of the initial load info */
4006 4006                  fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
4007 4007                      spa->spa_load_info);
4008 4008  
4009 4009                  /* Restore the initial load info */
4010 4010                  fnvlist_free(spa->spa_load_info);
4011 4011                  spa->spa_load_info = loadinfo;
4012 4012  
4013 4013                  return (load_error);
4014 4014          }
4015 4015  }
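
A worked example of the rewind window computed above: with TXG_DEFER_SIZE
of 2 and a last-synced uberblock at txg 100, safe_rewind_txg is 98, so a
normal recovery retries txgs 99 and 98 only; an extreme rewind keeps
walking back toward TXG_INITIAL and sets spa_extreme_rewind so the whole
pool is verified. A small illustrative helper (not in this file; values
assumed):

    /* Compute the rewind floor the retry loop above will not go below. */
    static uint64_t
    rewind_min_txg_sketch(uint64_t last_ubsync_txg, boolean_t extreme)
    {
            uint64_t safe_rewind_txg = last_ubsync_txg - TXG_DEFER_SIZE;

            return (extreme ? TXG_INITIAL : safe_rewind_txg);
    }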
4016 4016  
4017 4017  /*
4018 4018   * Pool Open/Import
4019 4019   *
4020 4020   * The import case is identical to an open except that the configuration is sent
4021 4021   * down from userland, instead of grabbed from the configuration cache.  For the
4022 4022   * case of an open, the pool configuration will exist in the
4023 4023   * POOL_STATE_UNINITIALIZED state.
4024 4024   *
4025 4025   * The stats information (gen/count/ustats) is used to gather vdev statistics
4026 4026   * while opening the pool, without having to keep the spa_t around in some
4027 4027   * ambiguous state.
4028 4028   */
4029 4029  static int
4030 4030  spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
4031 4031      nvlist_t **config)
4032 4032  {
4033 4033          spa_t *spa;
4034 4034          spa_load_state_t state = SPA_LOAD_OPEN;
4035 4035          int error;
4036 4036          int locked = B_FALSE;
4037 4037  
4038 4038          *spapp = NULL;
4039 4039  
4040 4040          /*
4041 4041           * As disgusting as this is, we need to support recursive calls to this
4042 4042           * function because dsl_dir_open() is called during spa_load(), and ends
4043 4043           * up calling spa_open() again.  The real fix is to figure out how to
4044 4044           * avoid dsl_dir_open() calling this in the first place.
4045 4045           */
4046 4046          if (mutex_owner(&spa_namespace_lock) != curthread) {
4047 4047                  mutex_enter(&spa_namespace_lock);
4048 4048                  locked = B_TRUE;
4049 4049          }
4050 4050  
4051 4051          if ((spa = spa_lookup(pool)) == NULL) {
4052 4052                  if (locked)
4053 4053                          mutex_exit(&spa_namespace_lock);
4054 4054                  return (SET_ERROR(ENOENT));
4055 4055          }
4056 4056  
4057 4057          if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
4058 4058                  zpool_load_policy_t policy;
4059 4059  
4060 4060                  zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
4061 4061                      &policy);
4062 4062                  if (policy.zlp_rewind & ZPOOL_DO_REWIND)
4063 4063                          state = SPA_LOAD_RECOVER;
4064 4064  
4065 4065                  spa_activate(spa, spa_mode_global);
4066 4066  
4067 4067                  if (state != SPA_LOAD_RECOVER)
4068 4068                          spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
4069 4069                  spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
4070 4070  
4071 4071                  zfs_dbgmsg("spa_open_common: opening %s", pool);
4072 4072                  error = spa_load_best(spa, state, policy.zlp_txg,
4073 4073                      policy.zlp_rewind);
4074 4074  
4075 4075                  if (error == EBADF) {
4076 4076                          /*
4077 4077                           * If vdev_validate() returns failure (indicated by
4078 4078                           * EBADF), it means that one of the vdevs indicates
4079 4079                           * that the pool has been exported or destroyed.  If
4080 4080                           * this is the case, the config cache is out of sync and
4081 4081                           * we should remove the pool from the namespace.
4082 4082                           */
4083 4083                          spa_unload(spa);
4084 4084                          spa_deactivate(spa);
4085 4085                          spa_write_cachefile(spa, B_TRUE, B_TRUE);
4086 4086                          spa_remove(spa);
4087 4087                          if (locked)
4088 4088                                  mutex_exit(&spa_namespace_lock);
4089 4089                          return (SET_ERROR(ENOENT));
4090 4090                  }
4091 4091  
4092 4092                  if (error) {
4093 4093                          /*
4094 4094                           * We can't open the pool, but we still have useful
4095 4095                           * information: the state of each vdev after the
4096 4096                           * attempted vdev_open().  Return this to the user.
4097 4097                           */
4098 4098                          if (config != NULL && spa->spa_config) {
4099 4099                                  VERIFY(nvlist_dup(spa->spa_config, config,
4100 4100                                      KM_SLEEP) == 0);
4101 4101                                  VERIFY(nvlist_add_nvlist(*config,
4102 4102                                      ZPOOL_CONFIG_LOAD_INFO,
4103 4103                                      spa->spa_load_info) == 0);
4104 4104                          }
4105 4105                          spa_unload(spa);
4106 4106                          spa_deactivate(spa);
4107 4107                          spa->spa_last_open_failed = error;
4108 4108                          if (locked)
4109 4109                                  mutex_exit(&spa_namespace_lock);
4110 4110                          *spapp = NULL;
4111 4111                          return (error);
4112 4112                  }
4113 4113          }
4114 4114  
4115 4115          spa_open_ref(spa, tag);
4116 4116  
4117 4117          if (config != NULL)
4118 4118                  *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
4119 4119  
4120 4120          /*
4121 4121           * If we've recovered the pool, pass back any information we
4122 4122           * gathered while doing the load.
4123 4123           */
4124 4124          if (state == SPA_LOAD_RECOVER) {
4125 4125                  VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
4126 4126                      spa->spa_load_info) == 0);
4127 4127          }
4128 4128  
4129 4129          if (locked) {
4130 4130                  spa->spa_last_open_failed = 0;
4131 4131                  spa->spa_last_ubsync_txg = 0;
4132 4132                  spa->spa_load_txg = 0;
4133 4133                  mutex_exit(&spa_namespace_lock);
4134 4134          }
4135 4135  
4136 4136          *spapp = spa;
4137 4137  
4138 4138          return (0);
4139 4139  }
4140 4140  
4141 4141  int
4142 4142  spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
4143 4143      nvlist_t **config)
4144 4144  {
4145 4145          return (spa_open_common(name, spapp, tag, policy, config));
4146 4146  }
4147 4147  
4148 4148  int
4149 4149  spa_open(const char *name, spa_t **spapp, void *tag)
4150 4150  {
4151 4151          return (spa_open_common(name, spapp, tag, NULL, NULL));
4152 4152  }
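
A typical caller pairs spa_open() with spa_close() under the same holder
tag; a minimal sketch (pool name illustrative):

    spa_t *spa;
    int error = spa_open("tank", &spa, FTAG);
    if (error == 0) {
            /* ... operate on the held pool ... */
            spa_close(spa, FTAG);
    }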
4153 4153  
4154 4154  /*
4155 4155   * Lookup the given spa_t, incrementing the inject count in the process,
4156 4156   * preventing it from being exported or destroyed.
4157 4157   */
4158 4158  spa_t *
4159 4159  spa_inject_addref(char *name)
4160 4160  {
4161 4161          spa_t *spa;
4162 4162  
4163 4163          mutex_enter(&spa_namespace_lock);
4164 4164          if ((spa = spa_lookup(name)) == NULL) {
4165 4165                  mutex_exit(&spa_namespace_lock);
4166 4166                  return (NULL);
4167 4167          }
4168 4168          spa->spa_inject_ref++;
4169 4169          mutex_exit(&spa_namespace_lock);
4170 4170  
4171 4171          return (spa);
4172 4172  }
4173 4173  
4174 4174  void
4175 4175  spa_inject_delref(spa_t *spa)
4176 4176  {
4177 4177          mutex_enter(&spa_namespace_lock);
4178 4178          spa->spa_inject_ref--;
4179 4179          mutex_exit(&spa_namespace_lock);
4180 4180  }
4181 4181  
4182 4182  /*
4183 4183   * Add spare device information to the nvlist.
4184 4184   */
4185 4185  static void
4186 4186  spa_add_spares(spa_t *spa, nvlist_t *config)
4187 4187  {
4188 4188          nvlist_t **spares;
4189 4189          uint_t i, nspares;
4190 4190          nvlist_t *nvroot;
4191 4191          uint64_t guid;
4192 4192          vdev_stat_t *vs;
4193 4193          uint_t vsc;
4194 4194          uint64_t pool;
4195 4195  
4196 4196          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4197 4197  
4198 4198          if (spa->spa_spares.sav_count == 0)
4199 4199                  return;
4200 4200  
4201 4201          VERIFY(nvlist_lookup_nvlist(config,
4202 4202              ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4203 4203          VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
4204 4204              ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
4205 4205          if (nspares != 0) {
4206 4206                  VERIFY(nvlist_add_nvlist_array(nvroot,
4207 4207                      ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4208 4208                  VERIFY(nvlist_lookup_nvlist_array(nvroot,
4209 4209                      ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
4210 4210  
4211 4211                  /*
4212 4212                   * Go through and find any spares which have since been
4213 4213                   * repurposed as an active spare.  If this is the case, update
4214 4214                   * their status appropriately.
4215 4215                   */
4216 4216                  for (i = 0; i < nspares; i++) {
4217 4217                          VERIFY(nvlist_lookup_uint64(spares[i],
4218 4218                              ZPOOL_CONFIG_GUID, &guid) == 0);
4219 4219                          if (spa_spare_exists(guid, &pool, NULL) &&
4220 4220                              pool != 0ULL) {
4221 4221                                  VERIFY(nvlist_lookup_uint64_array(
4222 4222                                      spares[i], ZPOOL_CONFIG_VDEV_STATS,
4223 4223                                      (uint64_t **)&vs, &vsc) == 0);
4224 4224                                  vs->vs_state = VDEV_STATE_CANT_OPEN;
4225 4225                                  vs->vs_aux = VDEV_AUX_SPARED;
4226 4226                          }
4227 4227                  }
4228 4228          }
4229 4229  }
4230 4230  
4231 4231  /*
4232 4232   * Add l2cache device information to the nvlist, including vdev stats.
4233 4233   */
4234 4234  static void
4235 4235  spa_add_l2cache(spa_t *spa, nvlist_t *config)
4236 4236  {
4237 4237          nvlist_t **l2cache;
4238 4238          uint_t i, j, nl2cache;
4239 4239          nvlist_t *nvroot;
4240 4240          uint64_t guid;
4241 4241          vdev_t *vd;
4242 4242          vdev_stat_t *vs;
4243 4243          uint_t vsc;
4244 4244  
4245 4245          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4246 4246  
4247 4247          if (spa->spa_l2cache.sav_count == 0)
4248 4248                  return;
4249 4249  
4250 4250          VERIFY(nvlist_lookup_nvlist(config,
4251 4251              ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
4252 4252          VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
4253 4253              ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4254 4254          if (nl2cache != 0) {
4255 4255                  VERIFY(nvlist_add_nvlist_array(nvroot,
4256 4256                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4257 4257                  VERIFY(nvlist_lookup_nvlist_array(nvroot,
4258 4258                      ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
4259 4259  
4260 4260                  /*
4261 4261                   * Update level 2 cache device stats.
4262 4262                   */
4263 4263  
4264 4264                  for (i = 0; i < nl2cache; i++) {
4265 4265                          VERIFY(nvlist_lookup_uint64(l2cache[i],
4266 4266                              ZPOOL_CONFIG_GUID, &guid) == 0);
4267 4267  
4268 4268                          vd = NULL;
4269 4269                          for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
4270 4270                                  if (guid ==
4271 4271                                      spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
4272 4272                                          vd = spa->spa_l2cache.sav_vdevs[j];
4273 4273                                          break;
4274 4274                                  }
4275 4275                          }
4276 4276                          ASSERT(vd != NULL);
4277 4277  
4278 4278                          VERIFY(nvlist_lookup_uint64_array(l2cache[i],
4279 4279                              ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
4280 4280                              == 0);
4281 4281                          vdev_get_stats(vd, vs);
4282 4282                  }
4283 4283          }
4284 4284  }
4285 4285  
4286 4286  static void
4287 4287  spa_add_feature_stats(spa_t *spa, nvlist_t *config)
4288 4288  {
4289 4289          nvlist_t *features;
4290 4290          zap_cursor_t zc;
4291 4291          zap_attribute_t za;
4292 4292  
4293 4293          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
4294 4294          VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4295 4295  
4296 4296          if (spa->spa_feat_for_read_obj != 0) {
4297 4297                  for (zap_cursor_init(&zc, spa->spa_meta_objset,
4298 4298                      spa->spa_feat_for_read_obj);
4299 4299                      zap_cursor_retrieve(&zc, &za) == 0;
4300 4300                      zap_cursor_advance(&zc)) {
4301 4301                          ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4302 4302                              za.za_num_integers == 1);
4303 4303                          VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
4304 4304                              za.za_first_integer));
4305 4305                  }
4306 4306                  zap_cursor_fini(&zc);
4307 4307          }
4308 4308  
4309 4309          if (spa->spa_feat_for_write_obj != 0) {
4310 4310                  for (zap_cursor_init(&zc, spa->spa_meta_objset,
4311 4311                      spa->spa_feat_for_write_obj);
4312 4312                      zap_cursor_retrieve(&zc, &za) == 0;
4313 4313                      zap_cursor_advance(&zc)) {
4314 4314                          ASSERT(za.za_integer_length == sizeof (uint64_t) &&
4315 4315                              za.za_num_integers == 1);
4316 4316                          VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
4317 4317                              za.za_first_integer));
4318 4318                  }
4319 4319                  zap_cursor_fini(&zc);
4320 4320          }
4321 4321  
4322 4322          VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
4323 4323              features) == 0);
4324 4324          nvlist_free(features);
4325 4325  }
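
A consumer of the resulting config can walk the feature nvlist with the
usual libnvpair iteration; a hedged userland sketch (each pair maps a
feature name to its uint64 reference count):

    nvlist_t *features;
    if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
        &features) == 0) {
            for (nvpair_t *pair = nvlist_next_nvpair(features, NULL);
                pair != NULL; pair = nvlist_next_nvpair(features, pair)) {
                    uint64_t refcount;
                    if (nvpair_value_uint64(pair, &refcount) == 0)
                            (void) printf("%s: %llu\n", nvpair_name(pair),
                                (u_longlong_t)refcount);
            }
    }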
4326 4326  
4327 4327  int
4328 4328  spa_get_stats(const char *name, nvlist_t **config,
4329 4329      char *altroot, size_t buflen)
4330 4330  {
4331 4331          int error;
4332 4332          spa_t *spa;
4333 4333  
4334 4334          *config = NULL;
4335 4335          error = spa_open_common(name, &spa, FTAG, NULL, config);
4336 4336  
4337 4337          if (spa != NULL) {
4338 4338                  /*
4339 4339                   * This still leaves a window of inconsistency where the spares
4340 4340                   * or l2cache devices could change and the config would be
4341 4341                   * self-inconsistent.
4342 4342                   */
4343 4343                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
4344 4344  
4345 4345                  if (*config != NULL) {
4346 4346                          uint64_t loadtimes[2];
4347 4347  
4348 4348                          loadtimes[0] = spa->spa_loaded_ts.tv_sec;
4349 4349                          loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
4350 4350                          VERIFY(nvlist_add_uint64_array(*config,
4351 4351                              ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
4352 4352  
4353 4353                          VERIFY(nvlist_add_uint64(*config,
4354 4354                              ZPOOL_CONFIG_ERRCOUNT,
4355 4355                              spa_get_errlog_size(spa)) == 0);
4356 4356  
4357 4357                          if (spa_suspended(spa))
4358 4358                                  VERIFY(nvlist_add_uint64(*config,
4359 4359                                      ZPOOL_CONFIG_SUSPENDED,
4360 4360                                      spa->spa_failmode) == 0);
4361 4361  
4362 4362                          spa_add_spares(spa, *config);
4363 4363                          spa_add_l2cache(spa, *config);
4364 4364                          spa_add_feature_stats(spa, *config);
4365 4365                  }
4366 4366          }
4367 4367  
4368 4368          /*
4369 4369           * We want to get the alternate root even for faulted pools, so we cheat
4370 4370           * and call spa_lookup() directly.
4371 4371           */
4372 4372          if (altroot) {
4373 4373                  if (spa == NULL) {
4374 4374                          mutex_enter(&spa_namespace_lock);
4375 4375                          spa = spa_lookup(name);
4376 4376                          if (spa)
4377 4377                                  spa_altroot(spa, altroot, buflen);
4378 4378                          else
4379 4379                                  altroot[0] = '\0';
4380 4380                          spa = NULL;
4381 4381                          mutex_exit(&spa_namespace_lock);
4382 4382                  } else {
4383 4383                          spa_altroot(spa, altroot, buflen);
4384 4384                  }
4385 4385          }
4386 4386  
4387 4387          if (spa != NULL) {
4388 4388                  spa_config_exit(spa, SCL_CONFIG, FTAG);
4389 4389                  spa_close(spa, FTAG);
4390 4390          }
4391 4391  
4392 4392          return (error);
4393 4393  }
4394 4394  
4395 4395  /*
4396 4396   * Validate that the auxiliary device array is well formed.  We must have an
4397 4397   * array of nvlists, each of which describes a valid leaf vdev.  If this is an
4398 4398   * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
4399 4399   * specified, as long as they are well-formed.
4400 4400   */
4401 4401  static int
4402 4402  spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
4403 4403      spa_aux_vdev_t *sav, const char *config, uint64_t version,
4404 4404      vdev_labeltype_t label)
4405 4405  {
4406 4406          nvlist_t **dev;
4407 4407          uint_t i, ndev;
4408 4408          vdev_t *vd;
4409 4409          int error;
4410 4410  
4411 4411          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4412 4412  
4413 4413          /*
4414 4414           * It's acceptable to have no devs specified.
4415 4415           */
4416 4416          if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
4417 4417                  return (0);
4418 4418  
4419 4419          if (ndev == 0)
4420 4420                  return (SET_ERROR(EINVAL));
4421 4421  
4422 4422          /*
4423 4423           * Make sure the pool is formatted with a version that supports this
4424 4424           * device type.
4425 4425           */
4426 4426          if (spa_version(spa) < version)
4427 4427                  return (SET_ERROR(ENOTSUP));
4428 4428  
4429 4429          /*
4430 4430           * Set the pending device list so we correctly handle device in-use
4431 4431           * checking.
4432 4432           */
4433 4433          sav->sav_pending = dev;
4434 4434          sav->sav_npending = ndev;
4435 4435  
4436 4436          for (i = 0; i < ndev; i++) {
4437 4437                  if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
4438 4438                      mode)) != 0)
4439 4439                          goto out;
4440 4440  
4441 4441                  if (!vd->vdev_ops->vdev_op_leaf) {
4442 4442                          vdev_free(vd);
4443 4443                          error = SET_ERROR(EINVAL);
4444 4444                          goto out;
4445 4445                  }
4446 4446  
4447 4447                  /*
4448 4448                   * The L2ARC currently only supports disk devices in
4449 4449                   * kernel context.  For user-level testing, we allow it.
4450 4450                   */
4451 4451  #ifdef _KERNEL
4452 4452                  if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
4453 4453                      strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
4454 4454                          error = SET_ERROR(ENOTBLK);
4455 4455                          vdev_free(vd);
4456 4456                          goto out;
4457 4457                  }
4458 4458  #endif
4459 4459                  vd->vdev_top = vd;
4460 4460  
4461 4461                  if ((error = vdev_open(vd)) == 0 &&
4462 4462                      (error = vdev_label_init(vd, crtxg, label)) == 0) {
4463 4463                          VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
4464 4464                              vd->vdev_guid) == 0);
4465 4465                  }
4466 4466  
4467 4467                  vdev_free(vd);
4468 4468  
4469 4469                  if (error &&
4470 4470                      (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
4471 4471                          goto out;
4472 4472                  else
4473 4473                          error = 0;
4474 4474          }
4475 4475  
4476 4476  out:
4477 4477          sav->sav_pending = NULL;
4478 4478          sav->sav_npending = 0;
4479 4479          return (error);
4480 4480  }
4481 4481  
4482 4482  static int
4483 4483  spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
4484 4484  {
4485 4485          int error;
4486 4486  
4487 4487          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4488 4488  
4489 4489          if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4490 4490              &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
4491 4491              VDEV_LABEL_SPARE)) != 0) {
4492 4492                  return (error);
4493 4493          }
4494 4494  
4495 4495          return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
4496 4496              &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
4497 4497              VDEV_LABEL_L2CACHE));
4498 4498  }
4499 4499  
4500 4500  static void
4501 4501  spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
4502 4502      const char *config)
4503 4503  {
4504 4504          int i;
4505 4505  
4506 4506          if (sav->sav_config != NULL) {
4507 4507                  nvlist_t **olddevs;
4508 4508                  uint_t oldndevs;
4509 4509                  nvlist_t **newdevs;
4510 4510  
4511 4511                  /*
4512 4512                   * Generate a new dev list by concatenating with the
4513 4513                   * current dev list.
4514 4514                   */
4515 4515                  VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
4516 4516                      &olddevs, &oldndevs) == 0);
4517 4517  
4518 4518                  newdevs = kmem_alloc(sizeof (void *) *
4519 4519                      (ndevs + oldndevs), KM_SLEEP);
4520 4520                  for (i = 0; i < oldndevs; i++)
4521 4521                          VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
4522 4522                              KM_SLEEP) == 0);
4523 4523                  for (i = 0; i < ndevs; i++)
4524 4524                          VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
4525 4525                              KM_SLEEP) == 0);
4526 4526  
4527 4527                  VERIFY(nvlist_remove(sav->sav_config, config,
4528 4528                      DATA_TYPE_NVLIST_ARRAY) == 0);
4529 4529  
4530 4530                  VERIFY(nvlist_add_nvlist_array(sav->sav_config,
4531 4531                      config, newdevs, ndevs + oldndevs) == 0);
4532 4532                  for (i = 0; i < oldndevs + ndevs; i++)
4533 4533                          nvlist_free(newdevs[i]);
4534 4534                  kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
4535 4535          } else {
4536 4536                  /*
4537 4537                   * Generate a new dev list.
4538 4538                   */
4539 4539                  VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
4540 4540                      KM_SLEEP) == 0);
4541 4541                  VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
4542 4542                      devs, ndevs) == 0);
4543 4543          }
4544 4544  }
4545 4545  
4546 4546  /*
4547 4547   * Stop and drop level 2 ARC devices
4548 4548   */
4549 4549  void
4550 4550  spa_l2cache_drop(spa_t *spa)
4551 4551  {
4552 4552          vdev_t *vd;
4553 4553          int i;
4554 4554          spa_aux_vdev_t *sav = &spa->spa_l2cache;
4555 4555  
4556 4556          for (i = 0; i < sav->sav_count; i++) {
4557 4557                  uint64_t pool;
4558 4558  
4559 4559                  vd = sav->sav_vdevs[i];
4560 4560                  ASSERT(vd != NULL);
4561 4561  
4562 4562                  if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
4563 4563                      pool != 0ULL && l2arc_vdev_present(vd))
4564 4564                          l2arc_remove_vdev(vd);
4565 4565          }
4566 4566  }
4567 4567  
4568 4568  /*
4569 4569   * Pool Creation
4570 4570   */
4571 4571  int
4572 4572  spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
4573 4573      nvlist_t *zplprops)
4574 4574  {
4575 4575          spa_t *spa;
4576 4576          char *altroot = NULL;
4577 4577          vdev_t *rvd;
4578 4578          dsl_pool_t *dp;
4579 4579          dmu_tx_t *tx;
4580 4580          int error = 0;
4581 4581          uint64_t txg = TXG_INITIAL;
4582 4582          nvlist_t **spares, **l2cache;
4583 4583          uint_t nspares, nl2cache;
4584 4584          uint64_t version, obj;
4585 4585          boolean_t has_features;
     4586 +        char *poolname;
     4587 +        nvlist_t *nvl;
4586 4588  
     4589 +        if (nvlist_lookup_string(props,
     4590 +            zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
     4591 +                poolname = (char *)pool;
     4592 +
4587 4593          /*
4588 4594           * If this pool already exists, return failure.
4589 4595           */
4590 4596          mutex_enter(&spa_namespace_lock);
4591      -        if (spa_lookup(pool) != NULL) {
     4597 +        if (spa_lookup(poolname) != NULL) {
4592 4598                  mutex_exit(&spa_namespace_lock);
4593 4599                  return (SET_ERROR(EEXIST));
4594 4600          }
4595 4601  
4596 4602          /*
4597 4603           * Allocate a new spa_t structure.
4598 4604           */
     4605 +        nvl = fnvlist_alloc();
     4606 +        fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
4599 4607          (void) nvlist_lookup_string(props,
4600 4608              zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4601      -        spa = spa_add(pool, NULL, altroot);
     4609 +        spa = spa_add(poolname, nvl, altroot);
     4610 +        fnvlist_free(nvl);
4602 4611          spa_activate(spa, spa_mode_global);
4603 4612  
4604 4613          if (props && (error = spa_prop_validate(spa, props))) {
4605 4614                  spa_deactivate(spa);
4606 4615                  spa_remove(spa);
4607 4616                  mutex_exit(&spa_namespace_lock);
4608 4617                  return (error);
4609 4618          }
4610 4619  
     4620 +        /*
     4621 +         * Temporary pool names should never be written to disk.
     4622 +         */
     4623 +        if (poolname != pool)
     4624 +                spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
     4625 +
4611 4626          has_features = B_FALSE;
4612 4627          for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
4613 4628              elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
4614 4629                  if (zpool_prop_feature(nvpair_name(elem)))
4615 4630                          has_features = B_TRUE;
4616 4631          }
4617 4632  
4618 4633          if (has_features || nvlist_lookup_uint64(props,
4619 4634              zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
4620 4635                  version = SPA_VERSION;
4621 4636          }
4622 4637          ASSERT(SPA_VERSION_IS_SUPPORTED(version));
4623 4638  
4624 4639          spa->spa_first_txg = txg;
4625 4640          spa->spa_uberblock.ub_txg = txg - 1;
4626 4641          spa->spa_uberblock.ub_version = version;
4627 4642          spa->spa_ubsync = spa->spa_uberblock;
4628 4643          spa->spa_load_state = SPA_LOAD_CREATE;
4629 4644          spa->spa_removing_phys.sr_state = DSS_NONE;
4630 4645          spa->spa_removing_phys.sr_removing_vdev = -1;
4631 4646          spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
4632 4647  
4633 4648          /*
4634 4649           * Create "The Godfather" zio to hold all async IOs
4635 4650           */
4636 4651          spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
4637 4652              KM_SLEEP);
4638 4653          for (int i = 0; i < max_ncpus; i++) {
4639 4654                  spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
4640 4655                      ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
4641 4656                      ZIO_FLAG_GODFATHER);
4642 4657          }
4643 4658  
4644 4659          /*
4645 4660           * Create the root vdev.
4646 4661           */
4647 4662          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4648 4663  
4649 4664          error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
4650 4665  
4651 4666          ASSERT(error != 0 || rvd != NULL);
4652 4667          ASSERT(error != 0 || spa->spa_root_vdev == rvd);
4653 4668  
4654 4669          if (error == 0 && !zfs_allocatable_devs(nvroot))
4655 4670                  error = SET_ERROR(EINVAL);
4656 4671  
4657 4672          if (error == 0 &&
4658 4673              (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
4659 4674              (error = spa_validate_aux(spa, nvroot, txg,
4660 4675              VDEV_ALLOC_ADD)) == 0) {
4661 4676                  for (int c = 0; c < rvd->vdev_children; c++) {
4662 4677                          vdev_metaslab_set_size(rvd->vdev_child[c]);
4663 4678                          vdev_expand(rvd->vdev_child[c], txg);
4664 4679                  }
4665 4680          }
4666 4681  
4667 4682          spa_config_exit(spa, SCL_ALL, FTAG);
4668 4683  
4669 4684          if (error != 0) {
4670 4685                  spa_unload(spa);
4671 4686                  spa_deactivate(spa);
4672 4687                  spa_remove(spa);
4673 4688                  mutex_exit(&spa_namespace_lock);
4674 4689                  return (error);
4675 4690          }
4676 4691  
4677 4692          /*
4678 4693           * Get the list of spares, if specified.
4679 4694           */
4680 4695          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
4681 4696              &spares, &nspares) == 0) {
4682 4697                  VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
4683 4698                      KM_SLEEP) == 0);
4684 4699                  VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
4685 4700                      ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
4686 4701                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4687 4702                  spa_load_spares(spa);
4688 4703                  spa_config_exit(spa, SCL_ALL, FTAG);
4689 4704                  spa->spa_spares.sav_sync = B_TRUE;
4690 4705          }
4691 4706  
4692 4707          /*
4693 4708           * Get the list of level 2 cache devices, if specified.
4694 4709           */
4695 4710          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
4696 4711              &l2cache, &nl2cache) == 0) {
4697 4712                  VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
4698 4713                      NV_UNIQUE_NAME, KM_SLEEP) == 0);
4699 4714                  VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
4700 4715                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
4701 4716                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4702 4717                  spa_load_l2cache(spa);
4703 4718                  spa_config_exit(spa, SCL_ALL, FTAG);
4704 4719                  spa->spa_l2cache.sav_sync = B_TRUE;
4705 4720          }
4706 4721  
4707 4722          spa->spa_is_initializing = B_TRUE;
4708 4723          spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
4709 4724          spa->spa_meta_objset = dp->dp_meta_objset;
4710 4725          spa->spa_is_initializing = B_FALSE;
4711 4726  
4712 4727          /*
4713 4728           * Create DDTs (dedup tables).
4714 4729           */
4715 4730          ddt_create(spa);
4716 4731  
4717 4732          spa_update_dspace(spa);
4718 4733  
4719 4734          tx = dmu_tx_create_assigned(dp, txg);
4720 4735  
4721 4736          /*
4722 4737           * Create the pool config object.
4723 4738           */
4724 4739          spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
4725 4740              DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
4726 4741              DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
4727 4742  
4728 4743          if (zap_add(spa->spa_meta_objset,
4729 4744              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
4730 4745              sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
4731 4746                  cmn_err(CE_PANIC, "failed to add pool config");
4732 4747          }
4733 4748  
4734 4749          if (spa_version(spa) >= SPA_VERSION_FEATURES)
4735 4750                  spa_feature_create_zap_objects(spa, tx);
4736 4751  
4737 4752          if (zap_add(spa->spa_meta_objset,
4738 4753              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
4739 4754              sizeof (uint64_t), 1, &version, tx) != 0) {
4740 4755                  cmn_err(CE_PANIC, "failed to add pool version");
4741 4756          }
4742 4757  
4743 4758          /* Newly created pools with the right version are always deflated. */
4744 4759          if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
4745 4760                  spa->spa_deflate = TRUE;
4746 4761                  if (zap_add(spa->spa_meta_objset,
4747 4762                      DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
4748 4763                      sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
4749 4764                          cmn_err(CE_PANIC, "failed to add deflate");
4750 4765                  }
4751 4766          }
4752 4767  
4753 4768          /*
4754 4769           * Create the deferred-free bpobj.  Turn off compression
4755 4770           * because sync-to-convergence takes longer if the blocksize
4756 4771           * keeps changing.
4757 4772           */
4758 4773          obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
4759 4774          dmu_object_set_compress(spa->spa_meta_objset, obj,
4760 4775              ZIO_COMPRESS_OFF, tx);
4761 4776          if (zap_add(spa->spa_meta_objset,
4762 4777              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
4763 4778              sizeof (uint64_t), 1, &obj, tx) != 0) {
4764 4779                  cmn_err(CE_PANIC, "failed to add bpobj");
4765 4780          }
4766 4781          VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
4767 4782              spa->spa_meta_objset, obj));
4768 4783  
4769 4784          /*
4770 4785           * Create the pool's history object.
4771 4786           */
4772 4787          if (version >= SPA_VERSION_ZPOOL_HISTORY)
4773 4788                  spa_history_create_obj(spa, tx);
4774 4789  
4775 4790          /*
4776 4791           * Generate some random noise for salted checksums to operate on.
4777 4792           */
4778 4793          (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
4779 4794              sizeof (spa->spa_cksum_salt.zcs_bytes));
4780 4795  
4781 4796          /*
4782 4797           * Set pool properties.
4783 4798           */
4784 4799          spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
4785 4800          spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
4786 4801          spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
4787 4802          spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
4788 4803  
4789 4804          if (props != NULL) {
4790 4805                  spa_configfile_set(spa, props, B_FALSE);
4791 4806                  spa_sync_props(props, tx);
4792 4807          }
4793 4808  
4794 4809          dmu_tx_commit(tx);
4795 4810  
4796 4811          spa->spa_sync_on = B_TRUE;
4797 4812          txg_sync_start(spa->spa_dsl_pool);
4798 4813  
4799 4814          /*
4800 4815           * We explicitly wait for the first transaction to complete so that our
4801 4816           * bean counters are appropriately updated.
4802 4817           */
4803 4818          txg_wait_synced(spa->spa_dsl_pool, txg);
4804 4819  
4805 4820          spa_spawn_aux_threads(spa);
4806 4821  
4807 4822          spa_write_cachefile(spa, B_FALSE, B_TRUE);
4808 4823          spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
4809 4824  
4810 4825          spa_history_log_version(spa, "create");
4811 4826  
4812 4827          /*
4813 4828           * Don't count references from objsets that are already closed
4814 4829           * and are making their way through the eviction process.
4815 4830           */
4816 4831          spa_evicting_os_wait(spa);
4817 4832          spa->spa_minref = refcount_count(&spa->spa_refcount);
4818 4833          spa->spa_load_state = SPA_LOAD_NONE;
4819 4834  
4820 4835          mutex_exit(&spa_namespace_lock);
4821 4836  
4822 4837          return (0);
4823 4838  }
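
[Editor's note] The spa_minref capture above is a baseline-refcount pattern: once the pool has quiesced, record how many holds the pool legitimately owns, so that later "is anyone else using this?" checks compare against that baseline rather than zero. A toy illustration of the idea (all names invented; not the kernel API):

    #include <stdio.h>

    typedef struct toy_pool {
            int refcount;   /* all current holds */
            int minref;     /* baseline holds owned by the pool itself */
    } toy_pool_t;

    /* Analogous to !spa_refcount_zero(): any holds beyond the baseline? */
    static int
    toy_pool_busy(const toy_pool_t *p)
    {
            return (p->refcount > p->minref);
    }

    int
    main(void)
    {
            toy_pool_t p = { 2, 2 };        /* freshly created, quiesced */

            (void) printf("busy=%d\n", toy_pool_busy(&p));  /* prints 0 */
            p.refcount++;                   /* someone opens the pool */
            (void) printf("busy=%d\n", toy_pool_busy(&p));  /* prints 1 */
            return (0);
    }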
4824 4839  
4825 4840  #ifdef _KERNEL
4826 4841  /*
4827 4842   * Get the root pool information from the root disk, then import the root pool
4828 4843   * during the system boot up time.
4829 4844   */
4830 4845  extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
4831 4846  
4832 4847  static nvlist_t *
4833 4848  spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
4834 4849  {
4835 4850          nvlist_t *config;
4836 4851          nvlist_t *nvtop, *nvroot;
4837 4852          uint64_t pgid;
4838 4853  
4839 4854          if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
4840 4855                  return (NULL);
4841 4856  
4842 4857          /*
4843 4858           * Add this top-level vdev to the child array.
4844 4859           */
4845 4860          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4846 4861              &nvtop) == 0);
4847 4862          VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4848 4863              &pgid) == 0);
4849 4864          VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
4850 4865  
4851 4866          /*
4852 4867           * Put this pool's top-level vdevs into a root vdev.
4853 4868           */
4854 4869          VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4855 4870          VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
4856 4871              VDEV_TYPE_ROOT) == 0);
4857 4872          VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
4858 4873          VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
4859 4874          VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
4860 4875              &nvtop, 1) == 0);
4861 4876  
4862 4877          /*
4863 4878           * Replace the existing vdev_tree with the new root vdev in
4864 4879           * this pool's configuration (remove the old, add the new).
4865 4880           */
4866 4881          VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
4867 4882          nvlist_free(nvroot);
4868 4883          return (config);
4869 4884  }
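
[Editor's note] A minimal userland sketch of the nvlist surgery spa_generate_rootconf() performs: wrap a single top-level vdev nvlist in a synthetic "root" vdev with one child. It assumes libnvpair (compile with -lnvpair) and uses the literal key strings behind the ZPOOL_CONFIG_* macros; the guid and vdev type are invented stand-ins:

    #include <stdio.h>
    #include <libnvpair.h>

    int
    main(void)
    {
            nvlist_t *nvtop, *nvroot;

            /* Stand-in for the top-level vdev read from the boot label. */
            (void) nvlist_alloc(&nvtop, NV_UNIQUE_NAME, 0);
            (void) nvlist_add_string(nvtop, "type", "disk");
            (void) nvlist_add_uint64(nvtop, "guid", 0x1234ULL);

            /* Wrap it in a one-child "root" vdev, as the code above does. */
            (void) nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0);
            (void) nvlist_add_string(nvroot, "type", "root");
            (void) nvlist_add_uint64(nvroot, "id", 0ULL);
            (void) nvlist_add_nvlist_array(nvroot, "children", &nvtop, 1);

            nvlist_print(stdout, nvroot);
            nvlist_free(nvtop);
            nvlist_free(nvroot);
            return (0);
    }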
4870 4885  
4871 4886  /*
4872 4887   * Walk the vdev tree and see if we can find a device with "better"
4873 4888   * configuration. A configuration is "better" if the label on that
4874 4889   * device has a more recent txg.
4875 4890   */
4876 4891  static void
4877 4892  spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
4878 4893  {
4879 4894          for (int c = 0; c < vd->vdev_children; c++)
4880 4895                  spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
4881 4896  
4882 4897          if (vd->vdev_ops->vdev_op_leaf) {
4883 4898                  nvlist_t *label;
4884 4899                  uint64_t label_txg;
4885 4900  
4886 4901                  if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
4887 4902                      &label) != 0)
4888 4903                          return;
4889 4904  
4890 4905                  VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
4891 4906                      &label_txg) == 0);
4892 4907  
4893 4908                  /*
4894 4909                   * Do we have a better boot device?
4895 4910                   */
4896 4911                  if (label_txg > *txg) {
4897 4912                          *txg = label_txg;
4898 4913                          *avd = vd;
4899 4914                  }
4900 4915                  nvlist_free(label);
4901 4916          }
4902 4917  }
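
[Editor's note] The walk above is a post-order sweep that remembers the leaf whose label carries the newest txg. A self-contained toy model of the same recursion (types and values invented for illustration):

    #include <stdio.h>
    #include <stdint.h>

    typedef struct toy_vdev {
            struct toy_vdev **children;
            int nchildren;           /* 0 means leaf */
            uint64_t label_txg;      /* meaningful for leaves only */
    } toy_vdev_t;

    static void
    best_leaf(toy_vdev_t *vd, toy_vdev_t **avd, uint64_t *txg)
    {
            for (int c = 0; c < vd->nchildren; c++)
                    best_leaf(vd->children[c], avd, txg);

            /* A leaf with a newer label wins, as in spa_alt_rootvdev(). */
            if (vd->nchildren == 0 && vd->label_txg > *txg) {
                    *txg = vd->label_txg;
                    *avd = vd;
            }
    }

    int
    main(void)
    {
            toy_vdev_t a = { NULL, 0, 100 }, b = { NULL, 0, 105 };
            toy_vdev_t *kids[] = { &a, &b };
            toy_vdev_t root = { kids, 2, 0 };
            toy_vdev_t *best = NULL;
            uint64_t txg = 100;      /* txg of the label we booted from */

            best_leaf(&root, &best, &txg);
            (void) printf("best txg=%llu\n", (unsigned long long)txg);
            return (0);
    }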
4903 4918  
4904 4919  /*
4905 4920   * Import a root pool.
4906 4921   *
4907 4922   * For x86, devpath_list will consist of the devid and/or physpath name of
4908 4923   * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
4909 4924   * The GRUB "findroot" command will return the vdev we should boot.
4910 4925   *
4911 4926   * For SPARC, devpath_list consists of the physpath name of the booting
4912 4927   * device, whether the root pool is a single-device pool or a mirrored pool,
4913 4928   * e.g.
4914 4929   *      "/pci@1f,0/ide@d/disk@0,0:a"
4915 4930   */
4916 4931  int
4917 4932  spa_import_rootpool(char *devpath, char *devid)
4918 4933  {
4919 4934          spa_t *spa;
4920 4935          vdev_t *rvd, *bvd, *avd = NULL;
4921 4936          nvlist_t *config, *nvtop;
4922 4937          uint64_t guid, txg;
4923 4938          char *pname;
4924 4939          int error;
4925 4940  
4926 4941          /*
4927 4942           * Read the label from the boot device and generate a configuration.
4928 4943           */
4929 4944          config = spa_generate_rootconf(devpath, devid, &guid);
4930 4945  #if defined(_OBP) && defined(_KERNEL)
4931 4946          if (config == NULL) {
4932 4947                  if (strstr(devpath, "/iscsi/ssd") != NULL) {
4933 4948                          /* iscsi boot */
4934 4949                          get_iscsi_bootpath_phy(devpath);
4935 4950                          config = spa_generate_rootconf(devpath, devid, &guid);
4936 4951                  }
4937 4952          }
4938 4953  #endif
4939 4954          if (config == NULL) {
4940 4955                  cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
4941 4956                      devpath);
4942 4957                  return (SET_ERROR(EIO));
4943 4958          }
4944 4959  
4945 4960          VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
4946 4961              &pname) == 0);
4947 4962          VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
4948 4963  
4949 4964          mutex_enter(&spa_namespace_lock);
4950 4965          if ((spa = spa_lookup(pname)) != NULL) {
4951 4966                  /*
4952 4967                   * Remove the existing root pool from the namespace so that we
4953 4968                   * can replace it with the correct config we just read in.
4954 4969                   */
4955 4970                  spa_remove(spa);
4956 4971          }
4957 4972  
4958 4973          spa = spa_add(pname, config, NULL);
4959 4974          spa->spa_is_root = B_TRUE;
4960 4975          spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
4961 4976          if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
4962 4977              &spa->spa_ubsync.ub_version) != 0)
4963 4978                  spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
4964 4979  
4965 4980          /*
4966 4981           * Build up a vdev tree based on the boot device's label config.
4967 4982           */
4968 4983          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
4969 4984              &nvtop) == 0);
4970 4985          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4971 4986          error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
4972 4987              VDEV_ALLOC_ROOTPOOL);
4973 4988          spa_config_exit(spa, SCL_ALL, FTAG);
4974 4989          if (error) {
4975 4990                  mutex_exit(&spa_namespace_lock);
4976 4991                  nvlist_free(config);
4977 4992                  cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
4978 4993                      pname);
4979 4994                  return (error);
4980 4995          }
4981 4996  
4982 4997          /*
4983 4998           * Get the boot vdev.
4984 4999           */
4985 5000          if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
4986 5001                  cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
4987 5002                      (u_longlong_t)guid);
4988 5003                  error = SET_ERROR(ENOENT);
4989 5004                  goto out;
4990 5005          }
4991 5006  
4992 5007          /*
4993 5008           * Determine if there is a better boot device.
4994 5009           */
4995 5010          avd = bvd;
4996 5011          spa_alt_rootvdev(rvd, &avd, &txg);
4997 5012          if (avd != bvd) {
4998 5013                  cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
4999 5014                      "try booting from '%s'", avd->vdev_path);
5000 5015                  error = SET_ERROR(EINVAL);
5001 5016                  goto out;
5002 5017          }
5003 5018  
5004 5019          /*
5005 5020           * If the boot device is part of a spare vdev then ensure that
5006 5021           * we're booting off the active spare.
5007 5022           */
5008 5023          if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
5009 5024              !bvd->vdev_isspare) {
5010 5025                  cmn_err(CE_NOTE, "The boot device is currently spared. Please "
5011 5026                      "try booting from '%s'",
5012 5027                      bvd->vdev_parent->
5013 5028                      vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
5014 5029                  error = SET_ERROR(EINVAL);
5015 5030                  goto out;
5016 5031          }
5017 5032  
5018 5033          error = 0;
5019 5034  out:
5020 5035          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5021 5036          vdev_free(rvd);
5022 5037          spa_config_exit(spa, SCL_ALL, FTAG);
5023 5038          mutex_exit(&spa_namespace_lock);
5024 5039  
5025 5040          nvlist_free(config);
5026 5041          return (error);
5027 5042  }
5028 5043  
5029 5044  #endif
5030 5045  
5031 5046  /*
5032 5047   * Import a non-root pool into the system.
5033 5048   */
5034 5049  int
5035 5050  spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
5036 5051  {
5037 5052          spa_t *spa;
5038 5053          char *altroot = NULL;
5039 5054          spa_load_state_t state = SPA_LOAD_IMPORT;
5040 5055          zpool_load_policy_t policy;
5041 5056          uint64_t mode = spa_mode_global;
5042 5057          uint64_t readonly = B_FALSE;
5043 5058          int error;
5044 5059          nvlist_t *nvroot;
5045 5060          nvlist_t **spares, **l2cache;
5046 5061          uint_t nspares, nl2cache;
5047 5062  
5048 5063          /*
5049 5064           * If a pool with this name exists, return failure.
5050 5065           */
5051 5066          mutex_enter(&spa_namespace_lock);
5052 5067          if (spa_lookup(pool) != NULL) {
5053 5068                  mutex_exit(&spa_namespace_lock);
5054 5069                  return (SET_ERROR(EEXIST));
5055 5070          }
5056 5071  
5057 5072          /*
5058 5073           * Create and initialize the spa structure.
5059 5074           */
5060 5075          (void) nvlist_lookup_string(props,
5061 5076              zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
5062 5077          (void) nvlist_lookup_uint64(props,
5063 5078              zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
5064 5079          if (readonly)
5065 5080                  mode = FREAD;
5066 5081          spa = spa_add(pool, config, altroot);
5067 5082          spa->spa_import_flags = flags;
5068 5083  
5069 5084          /*
5070 5085           * Verbatim import - Take a pool and insert it into the namespace
5071 5086           * as if it had been loaded at boot.
5072 5087           */
5073 5088          if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
5074 5089                  if (props != NULL)
5075 5090                          spa_configfile_set(spa, props, B_FALSE);
5076 5091  
5077 5092                  spa_write_cachefile(spa, B_FALSE, B_TRUE);
5078 5093                  spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5079 5094                  zfs_dbgmsg("spa_import: verbatim import of %s", pool);
5080 5095                  mutex_exit(&spa_namespace_lock);
5081 5096                  return (0);
5082 5097          }
5083 5098  
5084 5099          spa_activate(spa, mode);
5085 5100  
5086 5101          /*
5087 5102           * Don't start async tasks until we know everything is healthy.
5088 5103           */
5089 5104          spa_async_suspend(spa);
5090 5105  
5091 5106          zpool_get_load_policy(config, &policy);
5092 5107          if (policy.zlp_rewind & ZPOOL_DO_REWIND)
5093 5108                  state = SPA_LOAD_RECOVER;
5094 5109  
5095 5110          spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
5096 5111  
5097 5112          if (state != SPA_LOAD_RECOVER) {
5098 5113                  spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
5099 5114                  zfs_dbgmsg("spa_import: importing %s", pool);
5100 5115          } else {
5101 5116                  zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
5102 5117                      "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
5103 5118          }
5104 5119          error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
5105 5120  
5106 5121          /*
5107 5122           * Propagate anything learned while loading the pool and pass it
5108 5123           * back to caller (i.e. rewind info, missing devices, etc).
5109 5124           */
5110 5125          VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5111 5126              spa->spa_load_info) == 0);
5112 5127  
5113 5128          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5114 5129          /*
5115 5130           * Toss any existing sparelist, as it no longer has any validity
5116 5131           * and conflicts with spa_has_spare().
5117 5132           */
5118 5133          if (spa->spa_spares.sav_config) {
5119 5134                  nvlist_free(spa->spa_spares.sav_config);
5120 5135                  spa->spa_spares.sav_config = NULL;
5121 5136                  spa_load_spares(spa);
5122 5137          }
5123 5138          if (spa->spa_l2cache.sav_config) {
5124 5139                  nvlist_free(spa->spa_l2cache.sav_config);
5125 5140                  spa->spa_l2cache.sav_config = NULL;
5126 5141                  spa_load_l2cache(spa);
5127 5142          }
5128 5143  
5129 5144          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
5130 5145              &nvroot) == 0);
5131 5146          if (error == 0)
5132 5147                  error = spa_validate_aux(spa, nvroot, -1ULL,
5133 5148                      VDEV_ALLOC_SPARE);
5134 5149          if (error == 0)
5135 5150                  error = spa_validate_aux(spa, nvroot, -1ULL,
5136 5151                      VDEV_ALLOC_L2CACHE);
5137 5152          spa_config_exit(spa, SCL_ALL, FTAG);
5138 5153  
5139 5154          if (props != NULL)
5140 5155                  spa_configfile_set(spa, props, B_FALSE);
5141 5156  
5142 5157          if (error != 0 || (props && spa_writeable(spa) &&
5143 5158              (error = spa_prop_set(spa, props)))) {
5144 5159                  spa_unload(spa);
5145 5160                  spa_deactivate(spa);
5146 5161                  spa_remove(spa);
5147 5162                  mutex_exit(&spa_namespace_lock);
5148 5163                  return (error);
5149 5164          }
5150 5165  
5151 5166          spa_async_resume(spa);
5152 5167  
5153 5168          /*
5154 5169           * Override any spares and level 2 cache devices as specified by
5155 5170           * the user, as these may have correct device names/devids, etc.
5156 5171           */
5157 5172          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
5158 5173              &spares, &nspares) == 0) {
5159 5174                  if (spa->spa_spares.sav_config)
5160 5175                          VERIFY(nvlist_remove(spa->spa_spares.sav_config,
5161 5176                              ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
5162 5177                  else
5163 5178                          VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
5164 5179                              NV_UNIQUE_NAME, KM_SLEEP) == 0);
5165 5180                  VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
5166 5181                      ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
5167 5182                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5168 5183                  spa_load_spares(spa);
5169 5184                  spa_config_exit(spa, SCL_ALL, FTAG);
5170 5185                  spa->spa_spares.sav_sync = B_TRUE;
5171 5186          }
5172 5187          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
5173 5188              &l2cache, &nl2cache) == 0) {
5174 5189                  if (spa->spa_l2cache.sav_config)
5175 5190                          VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
5176 5191                              ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
5177 5192                  else
5178 5193                          VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
5179 5194                              NV_UNIQUE_NAME, KM_SLEEP) == 0);
5180 5195                  VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
5181 5196                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
5182 5197                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5183 5198                  spa_load_l2cache(spa);
5184 5199                  spa_config_exit(spa, SCL_ALL, FTAG);
5185 5200                  spa->spa_l2cache.sav_sync = B_TRUE;
5186 5201          }
5187 5202  
5188 5203          /*
5189 5204           * Check for any removed devices.
5190 5205           */
5191 5206          if (spa->spa_autoreplace) {
5192 5207                  spa_aux_check_removed(&spa->spa_spares);
5193 5208                  spa_aux_check_removed(&spa->spa_l2cache);
5194 5209          }
5195 5210  
5196 5211          if (spa_writeable(spa)) {
5197 5212                  /*
5198 5213                   * Update the config cache to include the newly-imported pool.
5199 5214                   */
5200 5215                  spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5201 5216          }
5202 5217  
5203 5218          /*
5204 5219           * It's possible that the pool was expanded while it was exported.
5205 5220           * We kick off an async task to handle this for us.
5206 5221           */
5207 5222          spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
5208 5223  
5209 5224          spa_history_log_version(spa, "import");
5210 5225  
5211 5226          spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
5212 5227  
5213 5228          mutex_exit(&spa_namespace_lock);
5214 5229  
5215 5230          return (0);
5216 5231  }
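
[Editor's note] The spares/l2cache override above follows a simple nvlist pattern: delete any stale array from the sav_config, then install the caller-supplied one. A hedged userland sketch of the same two steps (assumes libnvpair; "spares" and "path" are the literal values behind the ZPOOL_CONFIG_* macros, and the device path is invented):

    #include <stdio.h>
    #include <libnvpair.h>

    int
    main(void)
    {
            nvlist_t *sav_config, *spare;

            (void) nvlist_alloc(&spare, NV_UNIQUE_NAME, 0);
            (void) nvlist_add_string(spare, "path", "/dev/dsk/c9t9d9s0");

            (void) nvlist_alloc(&sav_config, NV_UNIQUE_NAME, 0);
            /* Drop any stale array first; harmless if none is present. */
            (void) nvlist_remove(sav_config, "spares", DATA_TYPE_NVLIST_ARRAY);
            (void) nvlist_add_nvlist_array(sav_config, "spares", &spare, 1);

            nvlist_print(stdout, sav_config);
            nvlist_free(spare);
            nvlist_free(sav_config);
            return (0);
    }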
5217 5232  
5218 5233  nvlist_t *
5219 5234  spa_tryimport(nvlist_t *tryconfig)
5220 5235  {
5221 5236          nvlist_t *config = NULL;
5222 5237          char *poolname, *cachefile;
5223 5238          spa_t *spa;
5224 5239          uint64_t state;
5225 5240          int error;
5226 5241          zpool_load_policy_t policy;
5227 5242  
5228 5243          if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
5229 5244                  return (NULL);
5230 5245  
5231 5246          if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
5232 5247                  return (NULL);
5233 5248  
5234 5249          /*
5235 5250           * Create and initialize the spa structure.
5236 5251           */
5237 5252          mutex_enter(&spa_namespace_lock);
5238 5253          spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
5239 5254          spa_activate(spa, FREAD);
5240 5255  
5241 5256          /*
5242 5257           * Rewind pool if a max txg was provided.
5243 5258           */
5244 5259          zpool_get_load_policy(spa->spa_config, &policy);
5245 5260          if (policy.zlp_txg != UINT64_MAX) {
5246 5261                  spa->spa_load_max_txg = policy.zlp_txg;
5247 5262                  spa->spa_extreme_rewind = B_TRUE;
5248 5263                  zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
5249 5264                      poolname, (longlong_t)policy.zlp_txg);
5250 5265          } else {
5251 5266                  zfs_dbgmsg("spa_tryimport: importing %s", poolname);
5252 5267          }
5253 5268  
5254 5269          if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
5255 5270              == 0) {
5256 5271                  zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
5257 5272                  spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
5258 5273          } else {
5259 5274                  spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
5260 5275          }
5261 5276  
5262 5277          error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
5263 5278  
5264 5279          /*
5265 5280           * If 'tryconfig' was at least parsable, return the current config.
5266 5281           */
5267 5282          if (spa->spa_root_vdev != NULL) {
5268 5283                  config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
5269 5284                  VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
5270 5285                      poolname) == 0);
5271 5286                  VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
5272 5287                      state) == 0);
5273 5288                  VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
5274 5289                      spa->spa_uberblock.ub_timestamp) == 0);
5275 5290                  VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
5276 5291                      spa->spa_load_info) == 0);
5277 5292  
5278 5293                  /*
5279 5294                   * If the bootfs property exists on this pool then we
5280 5295                   * copy it out so that external consumers can tell which
5281 5296                   * pools are bootable.
5282 5297                   */
5283 5298                  if ((!error || error == EEXIST) && spa->spa_bootfs) {
5284 5299                          char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5285 5300  
5286 5301                          /*
5287 5302                           * We have to play games with the name since the
5288 5303                           * pool was opened as TRYIMPORT_NAME.
5289 5304                           */
5290 5305                          if (dsl_dsobj_to_dsname(spa_name(spa),
5291 5306                              spa->spa_bootfs, tmpname) == 0) {
5292 5307                                  char *cp;
5293 5308                                  char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
5294 5309  
5295 5310                                  cp = strchr(tmpname, '/');
5296 5311                                  if (cp == NULL) {
5297 5312                                          (void) strlcpy(dsname, tmpname,
5298 5313                                              MAXPATHLEN);
5299 5314                                  } else {
5300 5315                                          (void) snprintf(dsname, MAXPATHLEN,
5301 5316                                              "%s/%s", poolname, ++cp);
5302 5317                                  }
5303 5318                                  VERIFY(nvlist_add_string(config,
5304 5319                                      ZPOOL_CONFIG_BOOTFS, dsname) == 0);
5305 5320                                  kmem_free(dsname, MAXPATHLEN);
5306 5321                          }
5307 5322                          kmem_free(tmpname, MAXPATHLEN);
5308 5323                  }
5309 5324  
5310 5325                  /*
5311 5326                   * Add the list of hot spares and level 2 cache devices.
5312 5327                   */
5313 5328                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5314 5329                  spa_add_spares(spa, config);
5315 5330                  spa_add_l2cache(spa, config);
5316 5331                  spa_config_exit(spa, SCL_CONFIG, FTAG);
5317 5332          }
5318 5333  
5319 5334          spa_unload(spa);
5320 5335          spa_deactivate(spa);
5321 5336          spa_remove(spa);
5322 5337          mutex_exit(&spa_namespace_lock);
5323 5338  
5324 5339          return (config);
5325 5340  }
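
[Editor's note] The bootfs name fix-up above reduces to a few lines of string handling: the dataset name comes back under TRYIMPORT_NAME, so everything after the first '/' is re-rooted at the real pool name. A standalone sketch ("$import" is assumed to be the TRYIMPORT_NAME value; names and buffer sizes are invented):

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            const char *tmpname = "$import/ROOT/solaris"; /* from dsl_dsobj_to_dsname() */
            const char *poolname = "rpool";               /* the real pool name */
            char dsname[256];
            const char *cp = strchr(tmpname, '/');

            if (cp == NULL)
                    (void) snprintf(dsname, sizeof (dsname), "%s", tmpname);
            else
                    (void) snprintf(dsname, sizeof (dsname), "%s/%s",
                        poolname, cp + 1);

            (void) printf("%s\n", dsname);  /* prints rpool/ROOT/solaris */
            return (0);
    }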
5326 5341  
5327 5342  /*
5328 5343   * Pool export/destroy
5329 5344   *
5330 5345   * The act of destroying or exporting a pool is very simple.  We make sure there
5331 5346   * is no more pending I/O and any references to the pool are gone.  Then, we
5332 5347   * update the pool state and sync all the labels to disk, removing the
5333 5348   * configuration from the cache afterwards. If the 'hardforce' flag is set, then
5334 5349   * we don't sync the labels or remove the configuration cache.
5335 5350   */
5336 5351  static int
5337 5352  spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
5338 5353      boolean_t force, boolean_t hardforce)
5339 5354  {
5340 5355          spa_t *spa;
5341 5356  
5342 5357          if (oldconfig)
5343 5358                  *oldconfig = NULL;
5344 5359  
5345 5360          if (!(spa_mode_global & FWRITE))
5346 5361                  return (SET_ERROR(EROFS));
5347 5362  
5348 5363          mutex_enter(&spa_namespace_lock);
5349 5364          if ((spa = spa_lookup(pool)) == NULL) {
5350 5365                  mutex_exit(&spa_namespace_lock);
5351 5366                  return (SET_ERROR(ENOENT));
5352 5367          }
5353 5368  
5354 5369          /*
5355 5370           * Put a hold on the pool, drop the namespace lock, stop async tasks,
5356 5371           * reacquire the namespace lock, and see if we can export.
5357 5372           */
5358 5373          spa_open_ref(spa, FTAG);
5359 5374          mutex_exit(&spa_namespace_lock);
5360 5375          spa_async_suspend(spa);
5361 5376          mutex_enter(&spa_namespace_lock);
5362 5377          spa_close(spa, FTAG);
5363 5378  
5364 5379          /*
5365 5380           * The pool will be in core if it's openable,
5366 5381           * in which case we can modify its state.
5367 5382           */
5368 5383          if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
5369 5384  
5370 5385                  /*
5371 5386                   * Objsets may be open only because they're dirty, so we
5372 5387                   * have to force it to sync before checking spa_refcnt.
5373 5388                   */
5374 5389                  txg_wait_synced(spa->spa_dsl_pool, 0);
5375 5390                  spa_evicting_os_wait(spa);
5376 5391  
5377 5392                  /*
5378 5393                   * A pool cannot be exported or destroyed if there are active
5379 5394                   * references.  If we are resetting a pool, allow references by
5380 5395                   * fault injection handlers.
5381 5396                   */
5382 5397                  if (!spa_refcount_zero(spa) ||
5383 5398                      (spa->spa_inject_ref != 0 &&
5384 5399                      new_state != POOL_STATE_UNINITIALIZED)) {
5385 5400                          spa_async_resume(spa);
5386 5401                          mutex_exit(&spa_namespace_lock);
5387 5402                          return (SET_ERROR(EBUSY));
5388 5403                  }
5389 5404  
5390 5405                  /*
5391 5406                   * A pool cannot be exported if it has an active shared spare.
5392 5407                   * This is to prevent other pools stealing the active spare
5393 5408   * from an exported pool.  At the user's discretion, such a pool
5394 5409   * can still be forcibly exported.
5395 5410                   */
5396 5411                  if (!force && new_state == POOL_STATE_EXPORTED &&
5397 5412                      spa_has_active_shared_spare(spa)) {
5398 5413                          spa_async_resume(spa);
5399 5414                          mutex_exit(&spa_namespace_lock);
5400 5415                          return (SET_ERROR(EXDEV));
5401 5416                  }
5402 5417  
5403 5418                  /*
5404 5419                   * We're about to export or destroy this pool. Make sure
5405 5420           * we stop all initialization activity here before we
5406 5421                   * set the spa_final_txg. This will ensure that all
5407 5422                   * dirty data resulting from the initialization is
5408 5423                   * committed to disk before we unload the pool.
5409 5424                   */
5410 5425                  if (spa->spa_root_vdev != NULL) {
5411 5426                          vdev_initialize_stop_all(spa->spa_root_vdev,
5412 5427                              VDEV_INITIALIZE_ACTIVE);
5413 5428                  }
5414 5429  
5415 5430                  /*
5416 5431                   * We want this to be reflected on every label,
5417 5432                   * so mark them all dirty.  spa_unload() will do the
5418 5433                   * final sync that pushes these changes out.
5419 5434                   */
5420 5435                  if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
5421 5436                          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5422 5437                          spa->spa_state = new_state;
5423 5438                          spa->spa_final_txg = spa_last_synced_txg(spa) +
5424 5439                              TXG_DEFER_SIZE + 1;
5425 5440                          vdev_config_dirty(spa->spa_root_vdev);
5426 5441                          spa_config_exit(spa, SCL_ALL, FTAG);
5427 5442                  }
5428 5443          }
5429 5444  
5430 5445          spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
5431 5446  
5432 5447          if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
5433 5448                  spa_unload(spa);
5434 5449                  spa_deactivate(spa);
5435 5450          }
5436 5451  
5437 5452          if (oldconfig && spa->spa_config)
5438 5453                  VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
5439 5454  
5440 5455          if (new_state != POOL_STATE_UNINITIALIZED) {
5441 5456                  if (!hardforce)
5442 5457                          spa_write_cachefile(spa, B_TRUE, B_TRUE);
5443 5458                  spa_remove(spa);
5444 5459          }
5445 5460          mutex_exit(&spa_namespace_lock);
5446 5461  
5447 5462          return (0);
5448 5463  }
5449 5464  
5450 5465  /*
5451 5466   * Destroy a storage pool.
5452 5467   */
5453 5468  int
5454 5469  spa_destroy(char *pool)
5455 5470  {
5456 5471          return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
5457 5472              B_FALSE, B_FALSE));
5458 5473  }
5459 5474  
5460 5475  /*
5461 5476   * Export a storage pool.
5462 5477   */
5463 5478  int
5464 5479  spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
5465 5480      boolean_t hardforce)
5466 5481  {
5467 5482          return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
5468 5483              force, hardforce));
5469 5484  }
5470 5485  
5471 5486  /*
5472 5487   * Similar to spa_export(), this unloads the spa_t without actually removing it
5473 5488   * from the namespace in any way.
5474 5489   */
5475 5490  int
5476 5491  spa_reset(char *pool)
5477 5492  {
5478 5493          return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
5479 5494              B_FALSE, B_FALSE));
5480 5495  }
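
[Editor's note] spa_destroy(), spa_export(), and spa_reset() are thin wrappers that differ only in the arguments they hand to spa_export_common(). A throwaway sketch of the mapping (states as strings; spa_export()'s force flags come from the caller):

    #include <stdio.h>

    struct export_mode {
            const char *caller;
            const char *new_state;
            const char *force, *hardforce;
    };

    int
    main(void)
    {
            struct export_mode modes[] = {
                    { "spa_destroy", "POOL_STATE_DESTROYED", "B_FALSE", "B_FALSE" },
                    { "spa_export", "POOL_STATE_EXPORTED", "caller", "caller" },
                    { "spa_reset", "POOL_STATE_UNINITIALIZED", "B_FALSE", "B_FALSE" },
            };

            for (int i = 0; i < 3; i++)
                    (void) printf("%-12s -> %-24s force=%s hardforce=%s\n",
                        modes[i].caller, modes[i].new_state,
                        modes[i].force, modes[i].hardforce);
            return (0);
    }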
5481 5496  
5482 5497  /*
5483 5498   * ==========================================================================
5484 5499   * Device manipulation
5485 5500   * ==========================================================================
5486 5501   */
5487 5502  
5488 5503  /*
5489 5504   * Add a device to a storage pool.
5490 5505   */
5491 5506  int
5492 5507  spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
5493 5508  {
5494 5509          uint64_t txg, id;
5495 5510          int error;
5496 5511          vdev_t *rvd = spa->spa_root_vdev;
5497 5512          vdev_t *vd, *tvd;
5498 5513          nvlist_t **spares, **l2cache;
5499 5514          uint_t nspares, nl2cache;
5500 5515  
5501 5516          ASSERT(spa_writeable(spa));
5502 5517  
5503 5518          txg = spa_vdev_enter(spa);
5504 5519  
5505 5520          if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
5506 5521              VDEV_ALLOC_ADD)) != 0)
5507 5522                  return (spa_vdev_exit(spa, NULL, txg, error));
5508 5523  
5509 5524          spa->spa_pending_vdev = vd;     /* spa_vdev_exit() will clear this */
5510 5525  
5511 5526          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
5512 5527              &nspares) != 0)
5513 5528                  nspares = 0;
5514 5529  
5515 5530          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
5516 5531              &nl2cache) != 0)
5517 5532                  nl2cache = 0;
5518 5533  
5519 5534          if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
5520 5535                  return (spa_vdev_exit(spa, vd, txg, EINVAL));
5521 5536  
5522 5537          if (vd->vdev_children != 0 &&
5523 5538              (error = vdev_create(vd, txg, B_FALSE)) != 0)
5524 5539                  return (spa_vdev_exit(spa, vd, txg, error));
5525 5540  
5526 5541          /*
5527 5542           * We must validate the spares and l2cache devices after checking the
5528 5543           * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
5529 5544           */
5530 5545          if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
5531 5546                  return (spa_vdev_exit(spa, vd, txg, error));
5532 5547  
5533 5548          /*
5534 5549           * If we are in the middle of a device removal, we can only add
5535 5550           * devices which match the existing devices in the pool.
5536 5551           * If we are in the middle of a removal, or have some indirect
5537 5552           * vdevs, we cannot add raidz toplevels.
5538 5553           */
5539 5554          if (spa->spa_vdev_removal != NULL ||
5540 5555              spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
5541 5556                  for (int c = 0; c < vd->vdev_children; c++) {
5542 5557                          tvd = vd->vdev_child[c];
5543 5558                          if (spa->spa_vdev_removal != NULL &&
5544 5559                              tvd->vdev_ashift != spa->spa_max_ashift) {
5545 5560                                  return (spa_vdev_exit(spa, vd, txg, EINVAL));
5546 5561                          }
5547 5562                          /* Fail if top level vdev is raidz */
5548 5563                          if (tvd->vdev_ops == &vdev_raidz_ops) {
5549 5564                                  return (spa_vdev_exit(spa, vd, txg, EINVAL));
5550 5565                          }
5551 5566                          /*
5552 5567                           * The top-level mirror must be a
5553 5568                           * mirror of leaf vdevs only.
5554 5569                           */
5555 5570                          if (tvd->vdev_ops == &vdev_mirror_ops) {
5556 5571                                  for (uint64_t cid = 0;
5557 5572                                      cid < tvd->vdev_children; cid++) {
5558 5573                                          vdev_t *cvd = tvd->vdev_child[cid];
5559 5574                                          if (!cvd->vdev_ops->vdev_op_leaf) {
5560 5575                                                  return (spa_vdev_exit(spa, vd,
5561 5576                                                      txg, EINVAL));
5562 5577                                          }
5563 5578                                  }
5564 5579                          }
5565 5580                  }
5566 5581          }
5567 5582  
5568 5583          for (int c = 0; c < vd->vdev_children; c++) {
5569 5584  
5570 5585                  /*
5571 5586                   * Set the vdev id to the first hole, if one exists.
5572 5587                   */
5573 5588                  for (id = 0; id < rvd->vdev_children; id++) {
5574 5589                          if (rvd->vdev_child[id]->vdev_ishole) {
5575 5590                                  vdev_free(rvd->vdev_child[id]);
5576 5591                                  break;
5577 5592                          }
5578 5593                  }
5579 5594                  tvd = vd->vdev_child[c];
5580 5595                  vdev_remove_child(vd, tvd);
5581 5596                  tvd->vdev_id = id;
5582 5597                  vdev_add_child(rvd, tvd);
5583 5598                  vdev_config_dirty(tvd);
5584 5599          }
5585 5600  
5586 5601          if (nspares != 0) {
5587 5602                  spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
5588 5603                      ZPOOL_CONFIG_SPARES);
5589 5604                  spa_load_spares(spa);
5590 5605                  spa->spa_spares.sav_sync = B_TRUE;
5591 5606          }
5592 5607  
5593 5608          if (nl2cache != 0) {
5594 5609                  spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
5595 5610                      ZPOOL_CONFIG_L2CACHE);
5596 5611                  spa_load_l2cache(spa);
5597 5612                  spa->spa_l2cache.sav_sync = B_TRUE;
5598 5613          }
5599 5614  
5600 5615          /*
5601 5616           * We have to be careful when adding new vdevs to an existing pool.
5602 5617           * If other threads start allocating from these vdevs before we
5603 5618           * sync the config cache, and we lose power, then upon reboot we may
5604 5619           * fail to open the pool because there are DVAs that the config cache
5605 5620           * can't translate.  Therefore, we first add the vdevs without
5606 5621           * initializing metaslabs; sync the config cache (via spa_vdev_exit());
5607 5622           * and then let spa_config_update() initialize the new metaslabs.
5608 5623           *
5609 5624           * spa_load() checks for added-but-not-initialized vdevs, so that
5610 5625           * if we lose power at any point in this sequence, the remaining
5611 5626           * steps will be completed the next time we load the pool.
5612 5627           */
5613 5628          (void) spa_vdev_exit(spa, vd, txg, 0);
5614 5629  
5615 5630          mutex_enter(&spa_namespace_lock);
5616 5631          spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5617 5632          spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
5618 5633          mutex_exit(&spa_namespace_lock);
5619 5634  
5620 5635          return (0);
5621 5636  }
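
[Editor's note] The id-selection loop above implements "reuse the first hole slot, else append": if no child of the root vdev is a hole, the scan runs off the end and id lands at rvd->vdev_children. A toy version of that slot search (invented types):

    #include <stdio.h>

    /* Returns the slot for a new top-level vdev: first hole, else nchildren. */
    static int
    first_free_slot(const int *ishole, int nchildren)
    {
            int id;

            for (id = 0; id < nchildren; id++) {
                    if (ishole[id])
                            break;
            }
            return (id);
    }

    int
    main(void)
    {
            int holes[] = { 0, 1, 0 };      /* child 1 was removed earlier */

            (void) printf("slot=%d\n", first_free_slot(holes, 3)); /* 1 */
            return (0);
    }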
5622 5637  
5623 5638  /*
5624 5639   * Attach a device to a mirror.  The arguments are the path to any device
5625 5640   * in the mirror, and the nvroot for the new device.  If the path specifies
5626 5641   * a device that is not mirrored, we automatically insert the mirror vdev.
5627 5642   *
5628 5643   * If 'replacing' is specified, the new device is intended to replace the
5629 5644   * existing device; in this case the two devices are made into their own
5630 5645   * mirror using the 'replacing' vdev, which is functionally identical to
5631 5646   * the mirror vdev (it actually reuses all the same ops) but has a few
5632 5647   * extra rules: you can't attach to it after it's been created, and upon
5633 5648   * completion of resilvering, the first disk (the one being replaced)
5634 5649   * is automatically detached.
5635 5650   */
5636 5651  int
5637 5652  spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
5638 5653  {
5639 5654          uint64_t txg, dtl_max_txg;
5640 5655          vdev_t *rvd = spa->spa_root_vdev;
5641 5656          vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
5642 5657          vdev_ops_t *pvops;
5643 5658          char *oldvdpath, *newvdpath;
5644 5659          int newvd_isspare;
5645 5660          int error;
5646 5661  
5647 5662          ASSERT(spa_writeable(spa));
5648 5663  
5649 5664          txg = spa_vdev_enter(spa);
5650 5665  
5651 5666          oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
5652 5667  
5653 5668          ASSERT(MUTEX_HELD(&spa_namespace_lock));
5654 5669          if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
5655 5670                  error = (spa_has_checkpoint(spa)) ?
5656 5671                      ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
5657 5672                  return (spa_vdev_exit(spa, NULL, txg, error));
5658 5673          }
5659 5674  
5660 5675          if (spa->spa_vdev_removal != NULL)
5661 5676                  return (spa_vdev_exit(spa, NULL, txg, EBUSY));
5662 5677  
5663 5678          if (oldvd == NULL)
5664 5679                  return (spa_vdev_exit(spa, NULL, txg, ENODEV));
5665 5680  
5666 5681          if (!oldvd->vdev_ops->vdev_op_leaf)
5667 5682                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5668 5683  
5669 5684          pvd = oldvd->vdev_parent;
5670 5685  
5671 5686          if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
5672 5687              VDEV_ALLOC_ATTACH)) != 0)
5673 5688                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
5674 5689  
5675 5690          if (newrootvd->vdev_children != 1)
5676 5691                  return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
5677 5692  
5678 5693          newvd = newrootvd->vdev_child[0];
5679 5694  
5680 5695          if (!newvd->vdev_ops->vdev_op_leaf)
5681 5696                  return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
5682 5697  
5683 5698          if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
5684 5699                  return (spa_vdev_exit(spa, newrootvd, txg, error));
5685 5700  
5686 5701          /*
5687 5702           * Spares can't replace logs
5688 5703           */
5689 5704          if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
5690 5705                  return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5691 5706  
5692 5707          if (!replacing) {
5693 5708                  /*
5694 5709                   * For attach, the only allowable parent is a mirror or the root
5695 5710                   * vdev.
5696 5711                   */
5697 5712                  if (pvd->vdev_ops != &vdev_mirror_ops &&
5698 5713                      pvd->vdev_ops != &vdev_root_ops)
5699 5714                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5700 5715  
5701 5716                  pvops = &vdev_mirror_ops;
5702 5717          } else {
5703 5718                  /*
5704 5719                   * Active hot spares can only be replaced by inactive hot
5705 5720                   * spares.
5706 5721                   */
5707 5722                  if (pvd->vdev_ops == &vdev_spare_ops &&
5708 5723                      oldvd->vdev_isspare &&
5709 5724                      !spa_has_spare(spa, newvd->vdev_guid))
5710 5725                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5711 5726  
5712 5727                  /*
5713 5728                   * If the source is a hot spare, and the parent isn't already a
5714 5729                   * spare, then we want to create a new hot spare.  Otherwise, we
5715 5730                   * want to create a replacing vdev.  The user is not allowed to
5716 5731                   * attach to a spared vdev child unless the 'isspare' state is
5717 5732                   * the same (spare replaces spare, non-spare replaces
5718 5733                   * non-spare).
5719 5734                   */
5720 5735                  if (pvd->vdev_ops == &vdev_replacing_ops &&
5721 5736                      spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
5722 5737                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5723 5738                  } else if (pvd->vdev_ops == &vdev_spare_ops &&
5724 5739                      newvd->vdev_isspare != oldvd->vdev_isspare) {
5725 5740                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
5726 5741                  }
5727 5742  
5728 5743                  if (newvd->vdev_isspare)
5729 5744                          pvops = &vdev_spare_ops;
5730 5745                  else
5731 5746                          pvops = &vdev_replacing_ops;
5732 5747          }
5733 5748  
5734 5749          /*
5735 5750           * Make sure the new device is big enough.
5736 5751           */
5737 5752          if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
5738 5753                  return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
5739 5754  
5740 5755          /*
5741 5756           * The new device cannot have a higher alignment requirement
5742 5757           * than the top-level vdev.
5743 5758           */
5744 5759          if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
5745 5760                  return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
5746 5761  
5747 5762          /*
5748 5763           * If this is an in-place replacement, update oldvd's path and devid
5749 5764           * to make it distinguishable from newvd, and unopenable from now on.
5750 5765           */
5751 5766          if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
5752 5767                  spa_strfree(oldvd->vdev_path);
5753 5768                  oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
5754 5769                      KM_SLEEP);
5755 5770                  (void) sprintf(oldvd->vdev_path, "%s/%s",
5756 5771                      newvd->vdev_path, "old");
5757 5772                  if (oldvd->vdev_devid != NULL) {
5758 5773                          spa_strfree(oldvd->vdev_devid);
5759 5774                          oldvd->vdev_devid = NULL;
5760 5775                  }
5761 5776          }
5762 5777  
5763 5778          /* mark the device being resilvered */
5764 5779          newvd->vdev_resilver_txg = txg;
5765 5780  
5766 5781          /*
5767 5782           * If the parent is not a mirror, or if we're replacing, insert the new
5768 5783           * mirror/replacing/spare vdev above oldvd.
5769 5784           */
5770 5785          if (pvd->vdev_ops != pvops)
5771 5786                  pvd = vdev_add_parent(oldvd, pvops);
5772 5787  
5773 5788          ASSERT(pvd->vdev_top->vdev_parent == rvd);
5774 5789          ASSERT(pvd->vdev_ops == pvops);
5775 5790          ASSERT(oldvd->vdev_parent == pvd);
5776 5791  
5777 5792          /*
5778 5793           * Extract the new device from its root and add it to pvd.
5779 5794           */
5780 5795          vdev_remove_child(newrootvd, newvd);
5781 5796          newvd->vdev_id = pvd->vdev_children;
5782 5797          newvd->vdev_crtxg = oldvd->vdev_crtxg;
5783 5798          vdev_add_child(pvd, newvd);
5784 5799  
5785 5800          tvd = newvd->vdev_top;
5786 5801          ASSERT(pvd->vdev_top == tvd);
5787 5802          ASSERT(tvd->vdev_parent == rvd);
5788 5803  
5789 5804          vdev_config_dirty(tvd);
5790 5805  
5791 5806          /*
5792 5807           * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
5793 5808           * for any dmu_sync-ed blocks.  It will propagate upward when
5794 5809           * spa_vdev_exit() calls vdev_dtl_reassess().
5795 5810           */
5796 5811          dtl_max_txg = txg + TXG_CONCURRENT_STATES;
5797 5812  
5798 5813          vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
5799 5814              dtl_max_txg - TXG_INITIAL);
5800 5815  
5801 5816          if (newvd->vdev_isspare) {
5802 5817                  spa_spare_activate(newvd);
5803 5818                  spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
5804 5819          }
5805 5820  
5806 5821          oldvdpath = spa_strdup(oldvd->vdev_path);
5807 5822          newvdpath = spa_strdup(newvd->vdev_path);
5808 5823          newvd_isspare = newvd->vdev_isspare;
5809 5824  
5810 5825          /*
5811 5826           * Mark newvd's DTL dirty in this txg.
5812 5827           */
5813 5828          vdev_dirty(tvd, VDD_DTL, newvd, txg);
5814 5829  
5815 5830          /*
5816 5831           * Schedule the resilver to restart in the future. We do this to
5817 5832           * ensure that dmu_sync-ed blocks have been stitched into the
5818 5833           * respective datasets.
5819 5834           */
5820 5835          dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
5821 5836  
5822 5837          if (spa->spa_bootfs)
5823 5838                  spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
5824 5839  
5825 5840          spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
5826 5841  
5827 5842          /*
5828 5843           * Commit the config
5829 5844           */
5830 5845          (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
5831 5846  
5832 5847          spa_history_log_internal(spa, "vdev attach", NULL,
5833 5848              "%s vdev=%s %s vdev=%s",
5834 5849              replacing && newvd_isspare ? "spare in" :
5835 5850              replacing ? "replace" : "attach", newvdpath,
5836 5851              replacing ? "for" : "to", oldvdpath);
5837 5852  
5838 5853          spa_strfree(oldvdpath);
5839 5854          spa_strfree(newvdpath);
5840 5855  
5841 5856          return (0);
5842 5857  }
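
[Editor's note] The in-place replacement rename above (sprintf of "%s/%s" with "old") is undone later by spa_vdev_detach(), which looks for a sibling whose path is the detached vdev's path plus "/old" (see the loop further down). A standalone sketch of that round trip (paths invented):

    #include <stdio.h>
    #include <string.h>

    int
    main(void)
    {
            const char *newpath = "/dev/dsk/c0t0d0s0";
            char oldpath[64];

            /* Attach side: tag the replaced device's path with "/old". */
            (void) snprintf(oldpath, sizeof (oldpath), "%s/%s", newpath, "old");
            (void) printf("renamed to %s\n", oldpath);

            /* Detach side: recognize the tag and restore the original path. */
            size_t len = strlen(newpath);
            if (strncmp(oldpath, newpath, len) == 0 &&
                strcmp(oldpath + len, "/old") == 0)
                    (void) printf("restored to %s\n", newpath);
            return (0);
    }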
5843 5858  
5844 5859  /*
5845 5860   * Detach a device from a mirror or replacing vdev.
5846 5861   *
5847 5862   * If 'replace_done' is specified, only detach if the parent
5848 5863   * is a replacing vdev.
5849 5864   */
5850 5865  int
5851 5866  spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
5852 5867  {
5853 5868          uint64_t txg;
5854 5869          int error;
5855 5870          vdev_t *rvd = spa->spa_root_vdev;
5856 5871          vdev_t *vd, *pvd, *cvd, *tvd;
5857 5872          boolean_t unspare = B_FALSE;
5858 5873          uint64_t unspare_guid = 0;
5859 5874          char *vdpath;
5860 5875  
5861 5876          ASSERT(spa_writeable(spa));
5862 5877  
5863 5878          txg = spa_vdev_enter(spa);
5864 5879  
5865 5880          vd = spa_lookup_by_guid(spa, guid, B_FALSE);
5866 5881  
5867 5882          /*
5868 5883           * Besides being called directly from userland through the
5869 5884           * ioctl interface, spa_vdev_detach() can potentially be called
5870 5885           * at the end of spa_vdev_resilver_done().
5871 5886           *
5872 5887           * In the regular case, when we have a checkpoint this shouldn't
5873 5888           * happen, as we never empty the DTLs of a vdev during the scrub
5874 5889           * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
5875 5890           * should never get here when we have a checkpoint.
5876 5891           *
5877 5892           * That said, even in the case where we checkpoint the pool exactly
5878 5893           * as spa_vdev_resilver_done() calls this function, everything
5879 5894           * should be fine, as the resilver will return right away.
5880 5895           */
5881 5896          ASSERT(MUTEX_HELD(&spa_namespace_lock));
5882 5897          if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
5883 5898                  error = (spa_has_checkpoint(spa)) ?
5884 5899                      ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
5885 5900                  return (spa_vdev_exit(spa, NULL, txg, error));
5886 5901          }
5887 5902  
5888 5903          if (vd == NULL)
5889 5904                  return (spa_vdev_exit(spa, NULL, txg, ENODEV));
5890 5905  
5891 5906          if (!vd->vdev_ops->vdev_op_leaf)
5892 5907                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5893 5908  
5894 5909          pvd = vd->vdev_parent;
5895 5910  
5896 5911          /*
5897 5912           * If the parent/child relationship is not as expected, don't do it.
5898 5913           * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
5899 5914           * vdev that's replacing B with C.  The user's intent in replacing
5900 5915           * is to go from M(A,B) to M(A,C).  If the user decides to cancel
5901 5916           * the replace by detaching C, the expected behavior is to end up
5902 5917           * M(A,B).  But suppose that right after deciding to detach C,
5903 5918           * the replacement of B completes.  We would have M(A,C), and then
5904 5919           * ask to detach C, which would leave us with just A -- not what
5905 5920           * the user wanted.  To prevent this, we make sure that the
5906 5921           * parent/child relationship hasn't changed -- in this example,
5907 5922           * that C's parent is still the replacing vdev R.
5908 5923           */
5909 5924          if (pvd->vdev_guid != pguid && pguid != 0)
5910 5925                  return (spa_vdev_exit(spa, NULL, txg, EBUSY));
5911 5926  
5912 5927          /*
5913 5928           * Only 'replacing' or 'spare' vdevs can be replaced.
5914 5929           */
5915 5930          if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
5916 5931              pvd->vdev_ops != &vdev_spare_ops)
5917 5932                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5918 5933  
5919 5934          ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
5920 5935              spa_version(spa) >= SPA_VERSION_SPARES);
5921 5936  
5922 5937          /*
5923 5938           * Only mirror, replacing, and spare vdevs support detach.
5924 5939           */
5925 5940          if (pvd->vdev_ops != &vdev_replacing_ops &&
5926 5941              pvd->vdev_ops != &vdev_mirror_ops &&
5927 5942              pvd->vdev_ops != &vdev_spare_ops)
5928 5943                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
5929 5944  
5930 5945          /*
5931 5946           * If this device has the only valid copy of some data,
5932 5947           * we cannot safely detach it.
5933 5948           */
5934 5949          if (vdev_dtl_required(vd))
5935 5950                  return (spa_vdev_exit(spa, NULL, txg, EBUSY));
5936 5951  
5937 5952          ASSERT(pvd->vdev_children >= 2);
5938 5953  
5939 5954          /*
5940 5955           * If we are detaching the second disk from a replacing vdev, then
5941 5956           * check to see if we changed the original vdev's path to have "/old"
5942 5957           * at the end in spa_vdev_attach().  If so, undo that change now.
5943 5958           */
5944 5959          if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
5945 5960              vd->vdev_path != NULL) {
5946 5961                  size_t len = strlen(vd->vdev_path);
5947 5962  
5948 5963                  for (int c = 0; c < pvd->vdev_children; c++) {
5949 5964                          cvd = pvd->vdev_child[c];
5950 5965  
5951 5966                          if (cvd == vd || cvd->vdev_path == NULL)
5952 5967                                  continue;
5953 5968  
5954 5969                          if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
5955 5970                              strcmp(cvd->vdev_path + len, "/old") == 0) {
5956 5971                                  spa_strfree(cvd->vdev_path);
5957 5972                                  cvd->vdev_path = spa_strdup(vd->vdev_path);
5958 5973                                  break;
5959 5974                          }
5960 5975                  }
5961 5976          }
5962 5977  
5963 5978          /*
5964 5979           * If we are detaching the original disk from a spare, then it implies
5965 5980           * that the spare should become a real disk, and be removed from the
5966 5981           * active spare list for the pool.
5967 5982           */
5968 5983          if (pvd->vdev_ops == &vdev_spare_ops &&
5969 5984              vd->vdev_id == 0 &&
5970 5985              pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
5971 5986                  unspare = B_TRUE;
5972 5987  
5973 5988          /*
5974 5989           * Erase the disk labels so the disk can be used for other things.
5975 5990           * This must be done after all other error cases are handled,
5976 5991           * but before we disembowel vd (so we can still do I/O to it).
5977 5992           * But if we can't do it, don't treat the error as fatal --
5978 5993           * it may be that the unwritability of the disk is the reason
5979 5994           * it's being detached!
5980 5995           */
5981 5996          error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
5982 5997  
5983 5998          /*
5984 5999           * Remove vd from its parent and compact the parent's children.
5985 6000           */
5986 6001          vdev_remove_child(pvd, vd);
5987 6002          vdev_compact_children(pvd);
5988 6003  
5989 6004          /*
5990 6005           * Remember one of the remaining children so we can get tvd below.
5991 6006           */
5992 6007          cvd = pvd->vdev_child[pvd->vdev_children - 1];
5993 6008  
5994 6009          /*
5995 6010           * If we need to remove the remaining child from the list of hot spares,
5996 6011           * do it now, marking the vdev as no longer a spare in the process.
5997 6012           * We must do this before vdev_remove_parent(), because that can
5998 6013           * change the GUID if it creates a new toplevel GUID.  For a similar
5999 6014           * reason, we must remove the spare now, in the same txg as the detach;
6000 6015           * otherwise someone could attach a new sibling, change the GUID, and
6001 6016           * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
6002 6017           */
6003 6018          if (unspare) {
6004 6019                  ASSERT(cvd->vdev_isspare);
6005 6020                  spa_spare_remove(cvd);
6006 6021                  unspare_guid = cvd->vdev_guid;
6007 6022                  (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
6008 6023                  cvd->vdev_unspare = B_TRUE;
6009 6024          }
6010 6025  
6011 6026          /*
6012 6027           * If the parent mirror/replacing vdev only has one child,
6013 6028           * the parent is no longer needed.  Remove it from the tree.
6014 6029           */
6015 6030          if (pvd->vdev_children == 1) {
6016 6031                  if (pvd->vdev_ops == &vdev_spare_ops)
6017 6032                          cvd->vdev_unspare = B_FALSE;
6018 6033                  vdev_remove_parent(cvd);
6019 6034          }
6020 6035  
6021 6036  
6022 6037          /*
6023 6038           * We don't set tvd until now because the parent we just removed
6024 6039           * may have been the previous top-level vdev.
6025 6040           */
6026 6041          tvd = cvd->vdev_top;
6027 6042          ASSERT(tvd->vdev_parent == rvd);
6028 6043  
6029 6044          /*
6030 6045           * Reevaluate the parent vdev state.
6031 6046           */
6032 6047          vdev_propagate_state(cvd);
6033 6048  
6034 6049          /*
6035 6050           * If the 'autoexpand' property is set on the pool then automatically
6036 6051           * try to expand the size of the pool. For example if the device we
6037 6052           * just detached was smaller than the others, it may be possible to
6038 6053           * add metaslabs (i.e. grow the pool). We need to reopen the vdev
6039 6054           * first so that we can obtain the updated sizes of the leaf vdevs.
6040 6055           */
6041 6056          if (spa->spa_autoexpand) {
6042 6057                  vdev_reopen(tvd);
6043 6058                  vdev_expand(tvd, txg);
6044 6059          }
6045 6060  
6046 6061          vdev_config_dirty(tvd);
6047 6062  
6048 6063          /*
6049 6064           * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
6050 6065           * vd->vdev_detached is set and free vd's DTL object in syncing context.
6051 6066           * But first make sure we're not on any *other* txg's DTL list, to
6052 6067           * prevent vd from being accessed after it's freed.
6053 6068           */
6054 6069          vdpath = spa_strdup(vd->vdev_path);
6055 6070          for (int t = 0; t < TXG_SIZE; t++)
6056 6071                  (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
6057 6072          vd->vdev_detached = B_TRUE;
6058 6073          vdev_dirty(tvd, VDD_DTL, vd, txg);
6059 6074  
6060 6075          spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
6061 6076  
6062 6077          /* hang on to the spa before we release the lock */
6063 6078          spa_open_ref(spa, FTAG);
6064 6079  
6065 6080          error = spa_vdev_exit(spa, vd, txg, 0);
6066 6081  
6067 6082          spa_history_log_internal(spa, "detach", NULL,
6068 6083              "vdev=%s", vdpath);
6069 6084          spa_strfree(vdpath);
6070 6085  
6071 6086          /*
6072 6087           * If this was the removal of the original device in a hot spare vdev,
6073 6088           * then we want to go through and remove the device from the hot spare
6074 6089           * list of every other pool.
6075 6090           */
6076 6091          if (unspare) {
6077 6092                  spa_t *altspa = NULL;
6078 6093  
6079 6094                  mutex_enter(&spa_namespace_lock);
6080 6095                  while ((altspa = spa_next(altspa)) != NULL) {
6081 6096                          if (altspa->spa_state != POOL_STATE_ACTIVE ||
6082 6097                              altspa == spa)
6083 6098                                  continue;
6084 6099  
6085 6100                          spa_open_ref(altspa, FTAG);
6086 6101                          mutex_exit(&spa_namespace_lock);
6087 6102                          (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
6088 6103                          mutex_enter(&spa_namespace_lock);
6089 6104                          spa_close(altspa, FTAG);
6090 6105                  }
6091 6106                  mutex_exit(&spa_namespace_lock);
6092 6107  
6093 6108                  /* search the rest of the vdevs for spares to remove */
6094 6109                  spa_vdev_resilver_done(spa);
6095 6110          }
6096 6111  
6097 6112          /* all done with the spa; OK to release */
6098 6113          mutex_enter(&spa_namespace_lock);
6099 6114          spa_close(spa, FTAG);
6100 6115          mutex_exit(&spa_namespace_lock);
6101 6116  
6102 6117          return (error);
6103 6118  }
6104 6119  
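The unspare loop above uses a standard namespace-lock pattern: take a
reference on the pool while the lock is held, drop the lock for the slow
spa_vdev_remove() call, then retake it to drop the reference before moving
to the next pool.  A minimal user-space sketch of the same pattern, assuming
a hypothetical refcounted pool_t (none of these names come from libzfs):

	#include <pthread.h>

	typedef struct pool {
		struct pool	*next;
		int		refcount;	/* keeps the pool alive */
	} pool_t;

	static pthread_mutex_t namespace_lock = PTHREAD_MUTEX_INITIALIZER;
	static pool_t *pool_list;

	static void
	slow_work(pool_t *p)		/* stands in for spa_vdev_remove() */
	{
		(void) p;
	}

	static void
	walk_pools(void)
	{
		pthread_mutex_lock(&namespace_lock);
		for (pool_t *p = pool_list; p != NULL; p = p->next) {
			p->refcount++;			/* spa_open_ref() */
			pthread_mutex_unlock(&namespace_lock);
			slow_work(p);		/* lock dropped: no deadlock */
			pthread_mutex_lock(&namespace_lock);
			p->refcount--;			/* spa_close() */
		}
		pthread_mutex_unlock(&namespace_lock);
	}

The reference taken before the lock is dropped is what makes reading
p->next safe once the lock is reacquired.
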
6105 6120  int
6106 6121  spa_vdev_initialize(spa_t *spa, uint64_t guid, uint64_t cmd_type)
6107 6122  {
6108 6123          /*
6109 6124           * We hold the namespace lock through the whole function
6110 6125           * to prevent any changes to the pool while we're starting or
6111 6126           * stopping initialization. The config and state locks are held so that
6112 6127           * we can properly assess the vdev state before we commit to
6113 6128           * the initializing operation.
6114 6129           */
6115 6130          mutex_enter(&spa_namespace_lock);
6116 6131          spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
6117 6132  
6118 6133          /* Look up vdev and ensure it's a leaf. */
6119 6134          vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
6120 6135          if (vd == NULL || vd->vdev_detached) {
6121 6136                  spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6122 6137                  mutex_exit(&spa_namespace_lock);
6123 6138                  return (SET_ERROR(ENODEV));
6124 6139          } else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
6125 6140                  spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6126 6141                  mutex_exit(&spa_namespace_lock);
6127 6142                  return (SET_ERROR(EINVAL));
6128 6143          } else if (!vdev_writeable(vd)) {
6129 6144                  spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6130 6145                  mutex_exit(&spa_namespace_lock);
6131 6146                  return (SET_ERROR(EROFS));
6132 6147          }
6133 6148          mutex_enter(&vd->vdev_initialize_lock);
6134 6149          spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
6135 6150  
6136 6151          /*
6137 6152           * When we activate an initialize action we check to see
6138 6153           * if the vdev_initialize_thread is NULL. We do this instead
6139 6154           * of using the vdev_initialize_state since there might be
6140 6155           * a previous initialization process that has completed but
6141 6156           * whose thread has not yet exited.
6142 6157           */
6143 6158          if (cmd_type == POOL_INITIALIZE_DO &&
6144 6159              (vd->vdev_initialize_thread != NULL ||
6145 6160              vd->vdev_top->vdev_removing)) {
6146 6161                  mutex_exit(&vd->vdev_initialize_lock);
6147 6162                  mutex_exit(&spa_namespace_lock);
6148 6163                  return (SET_ERROR(EBUSY));
6149 6164          } else if (cmd_type == POOL_INITIALIZE_CANCEL &&
6150 6165              (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
6151 6166              vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) {
6152 6167                  mutex_exit(&vd->vdev_initialize_lock);
6153 6168                  mutex_exit(&spa_namespace_lock);
6154 6169                  return (SET_ERROR(ESRCH));
6155 6170          } else if (cmd_type == POOL_INITIALIZE_SUSPEND &&
6156 6171              vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) {
6157 6172                  mutex_exit(&vd->vdev_initialize_lock);
6158 6173                  mutex_exit(&spa_namespace_lock);
6159 6174                  return (SET_ERROR(ESRCH));
6160 6175          }
6161 6176  
6162 6177          switch (cmd_type) {
6163 6178          case POOL_INITIALIZE_DO:
6164 6179                  vdev_initialize(vd);
6165 6180                  break;
6166 6181          case POOL_INITIALIZE_CANCEL:
6167 6182                  vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED);
6168 6183                  break;
6169 6184          case POOL_INITIALIZE_SUSPEND:
6170 6185                  vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED);
6171 6186                  break;
6172 6187          default:
6173 6188                  panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
6174 6189          }
6175 6190          mutex_exit(&vd->vdev_initialize_lock);
6176 6191  
6177 6192          /* Sync out the initializing state */
6178 6193          txg_wait_synced(spa->spa_dsl_pool, 0);
6179 6194          mutex_exit(&spa_namespace_lock);
6180 6195  
6181 6196          return (0);
6182 6197  }
6183 6198  
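The EBUSY/ESRCH ladder above is a small state machine.  A user-space model
of just the validation step, with illustrative enum names standing in for
POOL_INITIALIZE_* and VDEV_INITIALIZE_*:

	#include <errno.h>

	typedef enum { ST_NONE, ST_ACTIVE, ST_SUSPENDED, ST_CANCELED } st_t;
	typedef enum { CMD_DO, CMD_CANCEL, CMD_SUSPEND } cmd_t;

	static int
	initialize_cmd_check(cmd_t cmd, st_t state, int thread_running,
	    int removing)
	{
		if (cmd == CMD_DO && (thread_running || removing))
			return (EBUSY);
		if (cmd == CMD_CANCEL &&
		    state != ST_ACTIVE && state != ST_SUSPENDED)
			return (ESRCH);
		if (cmd == CMD_SUSPEND && state != ST_ACTIVE)
			return (ESRCH);
		return (0);
	}

For example, initialize_cmd_check(CMD_SUSPEND, ST_SUSPENDED, 0, 0) returns
ESRCH: an already-suspended initialization cannot be suspended again.
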
6184 6199  
6185 6200  /*
6186 6201   * Split a set of devices from their mirrors, and create a new pool from them.
6187 6202   */
6188 6203  int
6189 6204  spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
6190 6205      nvlist_t *props, boolean_t exp)
6191 6206  {
6192 6207          int error = 0;
6193 6208          uint64_t txg, *glist;
6194 6209          spa_t *newspa;
6195 6210          uint_t c, children, lastlog;
6196 6211          nvlist_t **child, *nvl, *tmp;
6197 6212          dmu_tx_t *tx;
6198 6213          char *altroot = NULL;
6199 6214          vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
6200 6215          boolean_t activate_slog;
6201 6216  
6202 6217          ASSERT(spa_writeable(spa));
6203 6218  
6204 6219          txg = spa_vdev_enter(spa);
6205 6220  
6206 6221          ASSERT(MUTEX_HELD(&spa_namespace_lock));
6207 6222          if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
6208 6223                  error = (spa_has_checkpoint(spa)) ?
6209 6224                      ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
6210 6225                  return (spa_vdev_exit(spa, NULL, txg, error));
6211 6226          }
6212 6227  
6213 6228          /* clear the log and flush everything up to now */
6214 6229          activate_slog = spa_passivate_log(spa);
6215 6230          (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
6216 6231          error = spa_reset_logs(spa);
6217 6232          txg = spa_vdev_config_enter(spa);
6218 6233  
6219 6234          if (activate_slog)
6220 6235                  spa_activate_log(spa);
6221 6236  
6222 6237          if (error != 0)
6223 6238                  return (spa_vdev_exit(spa, NULL, txg, error));
6224 6239  
6225 6240          /* check new spa name before going any further */
6226 6241          if (spa_lookup(newname) != NULL)
6227 6242                  return (spa_vdev_exit(spa, NULL, txg, EEXIST));
6228 6243  
6229 6244          /*
6230 6245           * scan through all the children to ensure they're all mirrors
6231 6246           */
6232 6247          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
6233 6248              nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
6234 6249              &children) != 0)
6235 6250                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6236 6251  
6237 6252          /* first, check to ensure we've got the right child count */
6238 6253          rvd = spa->spa_root_vdev;
6239 6254          lastlog = 0;
6240 6255          for (c = 0; c < rvd->vdev_children; c++) {
6241 6256                  vdev_t *vd = rvd->vdev_child[c];
6242 6257  
6243 6258                  /* don't count the holes & logs as children */
6244 6259                  if (vd->vdev_islog || !vdev_is_concrete(vd)) {
6245 6260                          if (lastlog == 0)
6246 6261                                  lastlog = c;
6247 6262                          continue;
6248 6263                  }
6249 6264  
6250 6265                  lastlog = 0;
6251 6266          }
6252 6267          if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
6253 6268                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6254 6269  
6255 6270          /* next, ensure no spare or cache devices are part of the split */
6256 6271          if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
6257 6272              nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
6258 6273                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
6259 6274  
6260 6275          vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
6261 6276          glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
6262 6277  
6263 6278          /* then, loop over each vdev and validate it */
6264 6279          for (c = 0; c < children; c++) {
6265 6280                  uint64_t is_hole = 0;
6266 6281  
6267 6282                  (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
6268 6283                      &is_hole);
6269 6284  
6270 6285                  if (is_hole != 0) {
6271 6286                          if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
6272 6287                              spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
6273 6288                                  continue;
6274 6289                          } else {
6275 6290                                  error = SET_ERROR(EINVAL);
6276 6291                                  break;
6277 6292                          }
6278 6293                  }
6279 6294  
6280 6295                  /* which disk is going to be split? */
6281 6296                  if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
6282 6297                      &glist[c]) != 0) {
6283 6298                          error = SET_ERROR(EINVAL);
6284 6299                          break;
6285 6300                  }
6286 6301  
6287 6302                  /* look it up in the spa */
6288 6303                  vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
6289 6304                  if (vml[c] == NULL) {
6290 6305                          error = SET_ERROR(ENODEV);
6291 6306                          break;
6292 6307                  }
6293 6308  
6294 6309                  /* make sure there's nothing stopping the split */
6295 6310                  if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
6296 6311                      vml[c]->vdev_islog ||
6297 6312                      !vdev_is_concrete(vml[c]) ||
6298 6313                      vml[c]->vdev_isspare ||
6299 6314                      vml[c]->vdev_isl2cache ||
6300 6315                      !vdev_writeable(vml[c]) ||
6301 6316                      vml[c]->vdev_children != 0 ||
6302 6317                      vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
6303 6318                      c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
6304 6319                          error = SET_ERROR(EINVAL);
6305 6320                          break;
6306 6321                  }
6307 6322  
6308 6323                  if (vdev_dtl_required(vml[c])) {
6309 6324                          error = SET_ERROR(EBUSY);
6310 6325                          break;
6311 6326                  }
6312 6327  
6313 6328                  /* we need certain info from the top level */
6314 6329                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
6315 6330                      vml[c]->vdev_top->vdev_ms_array) == 0);
6316 6331                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
6317 6332                      vml[c]->vdev_top->vdev_ms_shift) == 0);
6318 6333                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
6319 6334                      vml[c]->vdev_top->vdev_asize) == 0);
6320 6335                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
6321 6336                      vml[c]->vdev_top->vdev_ashift) == 0);
6322 6337  
6323 6338                  /* transfer per-vdev ZAPs */
6324 6339                  ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
6325 6340                  VERIFY0(nvlist_add_uint64(child[c],
6326 6341                      ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
6327 6342  
6328 6343                  ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
6329 6344                  VERIFY0(nvlist_add_uint64(child[c],
6330 6345                      ZPOOL_CONFIG_VDEV_TOP_ZAP,
6331 6346                      vml[c]->vdev_parent->vdev_top_zap));
6332 6347          }
6333 6348  
6334 6349          if (error != 0) {
6335 6350                  kmem_free(vml, children * sizeof (vdev_t *));
6336 6351                  kmem_free(glist, children * sizeof (uint64_t));
6337 6352                  return (spa_vdev_exit(spa, NULL, txg, error));
6338 6353          }
6339 6354  
6340 6355          /* stop writers from using the disks */
6341 6356          for (c = 0; c < children; c++) {
6342 6357                  if (vml[c] != NULL)
6343 6358                          vml[c]->vdev_offline = B_TRUE;
6344 6359          }
6345 6360          vdev_reopen(spa->spa_root_vdev);
6346 6361  
6347 6362          /*
6348 6363           * Temporarily record the splitting vdevs in the spa config.  This
6349 6364           * will disappear once the config is regenerated.
6350 6365           */
6351 6366          VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6352 6367          VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
6353 6368              glist, children) == 0);
6354 6369          kmem_free(glist, children * sizeof (uint64_t));
6355 6370  
6356 6371          mutex_enter(&spa->spa_props_lock);
6357 6372          VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
6358 6373              nvl) == 0);
6359 6374          mutex_exit(&spa->spa_props_lock);
6360 6375          spa->spa_config_splitting = nvl;
6361 6376          vdev_config_dirty(spa->spa_root_vdev);
6362 6377  
6363 6378          /* configure and create the new pool */
6364 6379          VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
6365 6380          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
6366 6381              exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
6367 6382          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
6368 6383              spa_version(spa)) == 0);
6369 6384          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
6370 6385              spa->spa_config_txg) == 0);
6371 6386          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
6372 6387              spa_generate_guid(NULL)) == 0);
6373 6388          VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
6374 6389          (void) nvlist_lookup_string(props,
6375 6390              zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
6376 6391  
6377 6392          /* add the new pool to the namespace */
6378 6393          newspa = spa_add(newname, config, altroot);
6379 6394          newspa->spa_avz_action = AVZ_ACTION_REBUILD;
6380 6395          newspa->spa_config_txg = spa->spa_config_txg;
6381 6396          spa_set_log_state(newspa, SPA_LOG_CLEAR);
6382 6397  
6383 6398          /* release the spa config lock, retaining the namespace lock */
6384 6399          spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
6385 6400  
6386 6401          if (zio_injection_enabled)
6387 6402                  zio_handle_panic_injection(spa, FTAG, 1);
6388 6403  
6389 6404          spa_activate(newspa, spa_mode_global);
6390 6405          spa_async_suspend(newspa);
6391 6406  
6392 6407          for (c = 0; c < children; c++) {
6393 6408                  if (vml[c] != NULL) {
6394 6409                          /*
6395 6410                           * Temporarily stop the initializing activity. We set
6396 6411                           * the state to ACTIVE so that we know to resume
6397 6412                           * initializing once the split has completed.
6398 6413                           */
6399 6414                          mutex_enter(&vml[c]->vdev_initialize_lock);
6400 6415                          vdev_initialize_stop(vml[c], VDEV_INITIALIZE_ACTIVE);
6401 6416                          mutex_exit(&vml[c]->vdev_initialize_lock);
6402 6417                  }
6403 6418          }
6404 6419  
6405 6420          newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
6406 6421  
6407 6422          /* create the new pool from the disks of the original pool */
6408 6423          error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
6409 6424          if (error)
6410 6425                  goto out;
6411 6426  
6412 6427          /* if that worked, generate a real config for the new pool */
6413 6428          if (newspa->spa_root_vdev != NULL) {
6414 6429                  VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
6415 6430                      NV_UNIQUE_NAME, KM_SLEEP) == 0);
6416 6431                  VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
6417 6432                      ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
6418 6433                  spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
6419 6434                      B_TRUE));
6420 6435          }
6421 6436  
6422 6437          /* set the props */
6423 6438          if (props != NULL) {
6424 6439                  spa_configfile_set(newspa, props, B_FALSE);
6425 6440                  error = spa_prop_set(newspa, props);
6426 6441                  if (error)
6427 6442                          goto out;
6428 6443          }
6429 6444  
6430 6445          /* flush everything */
6431 6446          txg = spa_vdev_config_enter(newspa);
6432 6447          vdev_config_dirty(newspa->spa_root_vdev);
6433 6448          (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
6434 6449  
6435 6450          if (zio_injection_enabled)
6436 6451                  zio_handle_panic_injection(spa, FTAG, 2);
6437 6452  
6438 6453          spa_async_resume(newspa);
6439 6454  
6440 6455          /* finally, update the original pool's config */
6441 6456          txg = spa_vdev_config_enter(spa);
6442 6457          tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
6443 6458          error = dmu_tx_assign(tx, TXG_WAIT);
6444 6459          if (error != 0)
6445 6460                  dmu_tx_abort(tx);
6446 6461          for (c = 0; c < children; c++) {
6447 6462                  if (vml[c] != NULL) {
6448 6463                          vdev_split(vml[c]);
6449 6464                          if (error == 0)
6450 6465                                  spa_history_log_internal(spa, "detach", tx,
6451 6466                                      "vdev=%s", vml[c]->vdev_path);
6452 6467  
6453 6468                          vdev_free(vml[c]);
6454 6469                  }
6455 6470          }
6456 6471          spa->spa_avz_action = AVZ_ACTION_REBUILD;
6457 6472          vdev_config_dirty(spa->spa_root_vdev);
6458 6473          spa->spa_config_splitting = NULL;
6459 6474          nvlist_free(nvl);
6460 6475          if (error == 0)
6461 6476                  dmu_tx_commit(tx);
6462 6477          (void) spa_vdev_exit(spa, NULL, txg, 0);
6463 6478  
6464 6479          if (zio_injection_enabled)
6465 6480                  zio_handle_panic_injection(spa, FTAG, 3);
6466 6481  
6467 6482          /* split is complete; log a history record */
6468 6483          spa_history_log_internal(newspa, "split", NULL,
6469 6484              "from pool %s", spa_name(spa));
6470 6485  
6471 6486          kmem_free(vml, children * sizeof (vdev_t *));
6472 6487  
6473 6488          /* if we're not going to mount the filesystems in userland, export */
6474 6489          if (exp)
6475 6490                  error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
6476 6491                      B_FALSE, B_FALSE);
6477 6492  
6478 6493          return (error);
6479 6494  
6480 6495  out:
6481 6496          spa_unload(newspa);
6482 6497          spa_deactivate(newspa);
6483 6498          spa_remove(newspa);
6484 6499  
6485 6500          txg = spa_vdev_config_enter(spa);
6486 6501  
6487 6502          /* re-online all offlined disks */
6488 6503          for (c = 0; c < children; c++) {
6489 6504                  if (vml[c] != NULL)
6490 6505                          vml[c]->vdev_offline = B_FALSE;
6491 6506          }
6492 6507  
6493 6508          /* restart initializing disks as necessary */
6494 6509          spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
6495 6510  
6496 6511          vdev_reopen(spa->spa_root_vdev);
6497 6512  
6498 6513          nvlist_free(spa->spa_config_splitting);
6499 6514          spa->spa_config_splitting = NULL;
6500 6515          (void) spa_vdev_exit(spa, NULL, txg, error);
6501 6516  
6502 6517          kmem_free(vml, children * sizeof (vdev_t *));
6503 6518          return (error);
6504 6519  }
6505 6520  
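The lastlog bookkeeping in the child-count check above admits log and hole
vdevs only as a trailing run of the root's children.  A user-space model of
the computation (islog_or_hole[] is an illustrative stand-in for the
vdev_islog / !vdev_is_concrete() tests):

	typedef unsigned int uint_t;

	static uint_t
	expected_split_children(const int *islog_or_hole, uint_t nchildren)
	{
		uint_t lastlog = 0;

		for (uint_t c = 0; c < nchildren; c++) {
			if (islog_or_hole[c]) {
				if (lastlog == 0)
					lastlog = c;
				continue;
			}
			lastlog = 0;
		}
		return (lastlog != 0 ? lastlog : nchildren);
	}

With children {data, data, log} this returns 2, so the caller must name
both data vdevs; with {data, log, data} it returns 3, because the log is
not at the tail and therefore still counts.
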
6506 6521  /*
6507 6522   * Find any device that's done replacing, or a vdev marked 'unspare' that's
6508 6523   * currently spared, so we can detach it.
6509 6524   */
6510 6525  static vdev_t *
6511 6526  spa_vdev_resilver_done_hunt(vdev_t *vd)
6512 6527  {
6513 6528          vdev_t *newvd, *oldvd;
6514 6529  
6515 6530          for (int c = 0; c < vd->vdev_children; c++) {
6516 6531                  oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
6517 6532                  if (oldvd != NULL)
6518 6533                          return (oldvd);
6519 6534          }
6520 6535  
6521 6536          /*
6522 6537           * Check for a completed replacement.  We always consider the first
6523 6538           * vdev in the list to be the oldest vdev, and the last one to be
6524 6539           * the newest (see spa_vdev_attach() for how that works).  In
6525 6540           * the case where the newest vdev is faulted, we will not automatically
6526 6541           * remove it after a resilver completes.  This is OK as it will require
6527 6542           * user intervention to determine which disk the admin wishes to keep.
6528 6543           */
6529 6544          if (vd->vdev_ops == &vdev_replacing_ops) {
6530 6545                  ASSERT(vd->vdev_children > 1);
6531 6546  
6532 6547                  newvd = vd->vdev_child[vd->vdev_children - 1];
6533 6548                  oldvd = vd->vdev_child[0];
6534 6549  
6535 6550                  if (vdev_dtl_empty(newvd, DTL_MISSING) &&
6536 6551                      vdev_dtl_empty(newvd, DTL_OUTAGE) &&
6537 6552                      !vdev_dtl_required(oldvd))
6538 6553                          return (oldvd);
6539 6554          }
6540 6555  
6541 6556          /*
6542 6557           * Check for a completed resilver with the 'unspare' flag set.
6543 6558           * Also potentially update faulted state.
6544 6559           */
6545 6560          if (vd->vdev_ops == &vdev_spare_ops) {
6546 6561                  vdev_t *first = vd->vdev_child[0];
6547 6562                  vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
6548 6563  
6549 6564                  if (last->vdev_unspare) {
6550 6565                          oldvd = first;
6551 6566                          newvd = last;
6552 6567                  } else if (first->vdev_unspare) {
6553 6568                          oldvd = last;
6554 6569                          newvd = first;
6555 6570                  } else {
6556 6571                          oldvd = NULL;
6557 6572                  }
6558 6573  
6559 6574                  if (oldvd != NULL &&
6560 6575                      vdev_dtl_empty(newvd, DTL_MISSING) &&
6561 6576                      vdev_dtl_empty(newvd, DTL_OUTAGE) &&
6562 6577                      !vdev_dtl_required(oldvd))
6563 6578                          return (oldvd);
6564 6579  
6565 6580                  vdev_propagate_state(vd);
6566 6581  
6567 6582                  /*
6568 6583                   * If there are more than two spares attached to a disk,
6569 6584                   * and those spares are not required, then we want to
6570 6585                   * attempt to free them up now so that they can be used
6571 6586                   * by other pools.  Once we're back down to a single
6572 6587                   * disk+spare, we stop removing them.
6573 6588                   */
6574 6589                  if (vd->vdev_children > 2) {
6575 6590                          newvd = vd->vdev_child[1];
6576 6591  
6577 6592                          if (newvd->vdev_isspare && last->vdev_isspare &&
6578 6593                              vdev_dtl_empty(last, DTL_MISSING) &&
6579 6594                              vdev_dtl_empty(last, DTL_OUTAGE) &&
6580 6595                              !vdev_dtl_required(newvd))
6581 6596                                  return (newvd);
6582 6597                  }
6583 6598          }
6584 6599  
6585 6600          return (NULL);
6586 6601  }
6587 6602  
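In the spare branch above, whichever end of the child array carries
vdev_unspare is the disk to detach, and the opposite end must have fully
resilvered before the detach is allowed.  A condensed model of that
selection (the three fields are illustrative reductions of the real vdev
checks):

	typedef struct vd {
		int	unspare;	/* vdev_unspare */
		int	dtl_empty;	/* DTL_MISSING and DTL_OUTAGE empty */
		int	required;	/* vdev_dtl_required() */
	} vd_t;

	/* Returns the child to detach, or NULL if nothing is ready. */
	static vd_t *
	pick_unspare_victim(vd_t *first, vd_t *last)
	{
		vd_t *oldvd = NULL, *newvd = NULL;

		if (last->unspare) {
			oldvd = first;
			newvd = last;
		} else if (first->unspare) {
			oldvd = last;
			newvd = first;
		}
		if (oldvd != NULL && newvd->dtl_empty && !oldvd->required)
			return (oldvd);
		return (NULL);
	}
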
6588 6603  static void
6589 6604  spa_vdev_resilver_done(spa_t *spa)
6590 6605  {
6591 6606          vdev_t *vd, *pvd, *ppvd;
6592 6607          uint64_t guid, sguid, pguid, ppguid;
6593 6608  
6594 6609          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6595 6610  
6596 6611          while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
6597 6612                  pvd = vd->vdev_parent;
6598 6613                  ppvd = pvd->vdev_parent;
6599 6614                  guid = vd->vdev_guid;
6600 6615                  pguid = pvd->vdev_guid;
6601 6616                  ppguid = ppvd->vdev_guid;
6602 6617                  sguid = 0;
6603 6618                  /*
6604 6619                   * If we have just finished replacing a hot spared device, then
6605 6620                   * we need to detach the parent's first child (the original hot
6606 6621                   * spare) as well.
6607 6622                   */
6608 6623                  if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
6609 6624                      ppvd->vdev_children == 2) {
6610 6625                          ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
6611 6626                          sguid = ppvd->vdev_child[1]->vdev_guid;
6612 6627                  }
6613 6628                  ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
6614 6629  
6615 6630                  spa_config_exit(spa, SCL_ALL, FTAG);
6616 6631                  if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
6617 6632                          return;
6618 6633                  if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
6619 6634                          return;
6620 6635                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
6621 6636          }
6622 6637  
6623 6638          spa_config_exit(spa, SCL_ALL, FTAG);
6624 6639  }
6625 6640  
6626 6641  /*
6627 6642   * Update the stored path or FRU for this vdev.
6628 6643   */
6629 6644  int
6630 6645  spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
6631 6646      boolean_t ispath)
6632 6647  {
6633 6648          vdev_t *vd;
6634 6649          boolean_t sync = B_FALSE;
6635 6650  
6636 6651          ASSERT(spa_writeable(spa));
6637 6652  
6638 6653          spa_vdev_state_enter(spa, SCL_ALL);
6639 6654  
6640 6655          if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
6641 6656                  return (spa_vdev_state_exit(spa, NULL, ENOENT));
6642 6657  
6643 6658          if (!vd->vdev_ops->vdev_op_leaf)
6644 6659                  return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
6645 6660  
6646 6661          if (ispath) {
6647 6662                  if (strcmp(value, vd->vdev_path) != 0) {
6648 6663                          spa_strfree(vd->vdev_path);
6649 6664                          vd->vdev_path = spa_strdup(value);
6650 6665                          sync = B_TRUE;
6651 6666                  }
6652 6667          } else {
6653 6668                  if (vd->vdev_fru == NULL) {
6654 6669                          vd->vdev_fru = spa_strdup(value);
6655 6670                          sync = B_TRUE;
6656 6671                  } else if (strcmp(value, vd->vdev_fru) != 0) {
6657 6672                          spa_strfree(vd->vdev_fru);
6658 6673                          vd->vdev_fru = spa_strdup(value);
6659 6674                          sync = B_TRUE;
6660 6675                  }
6661 6676          }
6662 6677  
6663 6678          return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
6664 6679  }
6665 6680  
6666 6681  int
6667 6682  spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
6668 6683  {
6669 6684          return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
6670 6685  }
6671 6686  
6672 6687  int
6673 6688  spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
6674 6689  {
6675 6690          return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
6676 6691  }
6677 6692  
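Both wrappers funnel into spa_vdev_set_common(), which dirties the vdev
only when the stored string actually changes (and, for the FRU, handles a
NULL current value).  A condensed user-space model of that update rule:

	#include <stdlib.h>
	#include <string.h>

	/* Returns 1 if *slotp changed and a config sync is needed. */
	static int
	set_if_changed(char **slotp, const char *value)
	{
		if (*slotp == NULL) {
			*slotp = strdup(value);
			return (1);
		}
		if (strcmp(value, *slotp) != 0) {
			free(*slotp);
			*slotp = strdup(value);
			return (1);
		}
		return (0);
	}

Note that the path branch in the kernel code above assumes vd->vdev_path
is non-NULL; only the FRU branch tolerates an unset current value.
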
6678 6693  /*
6679 6694   * ==========================================================================
6680 6695   * SPA Scanning
6681 6696   * ==========================================================================
6682 6697   */
6683 6698  int
6684 6699  spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
6685 6700  {
6686 6701          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
6687 6702  
6688 6703          if (dsl_scan_resilvering(spa->spa_dsl_pool))
6689 6704                  return (SET_ERROR(EBUSY));
6690 6705  
6691 6706          return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd));
6692 6707  }
6693 6708  
6694 6709  int
6695 6710  spa_scan_stop(spa_t *spa)
6696 6711  {
6697 6712          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
6698 6713          if (dsl_scan_resilvering(spa->spa_dsl_pool))
6699 6714                  return (SET_ERROR(EBUSY));
6700 6715          return (dsl_scan_cancel(spa->spa_dsl_pool));
6701 6716  }
6702 6717  
6703 6718  int
6704 6719  spa_scan(spa_t *spa, pool_scan_func_t func)
6705 6720  {
6706 6721          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
6707 6722  
6708 6723          if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
6709 6724                  return (SET_ERROR(ENOTSUP));
6710 6725  
6711 6726          /*
6712 6727           * If a resilver was requested, but there is no DTL on a
6713 6728           * writeable leaf device, we have nothing to do.
6714 6729           */
6715 6730          if (func == POOL_SCAN_RESILVER &&
6716 6731              !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
6717 6732                  spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
6718 6733                  return (0);
6719 6734          }
6720 6735  
6721 6736          return (dsl_scan(spa->spa_dsl_pool, func));
6722 6737  }
6723 6738  
6724 6739  /*
6725 6740   * ==========================================================================
6726 6741   * SPA async task processing
6727 6742   * ==========================================================================
6728 6743   */
6729 6744  
6730 6745  static void
6731 6746  spa_async_remove(spa_t *spa, vdev_t *vd)
6732 6747  {
6733 6748          if (vd->vdev_remove_wanted) {
6734 6749                  vd->vdev_remove_wanted = B_FALSE;
6735 6750                  vd->vdev_delayed_close = B_FALSE;
6736 6751                  vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
6737 6752  
6738 6753                  /*
6739 6754                   * We want to clear the stats, but we don't want to do a full
6740 6755                   * vdev_clear() as that will cause us to throw away
6741 6756                   * degraded/faulted state as well as attempt to reopen the
6742 6757                   * device, all of which is a waste.
6743 6758                   */
6744 6759                  vd->vdev_stat.vs_read_errors = 0;
6745 6760                  vd->vdev_stat.vs_write_errors = 0;
6746 6761                  vd->vdev_stat.vs_checksum_errors = 0;
6747 6762  
6748 6763                  vdev_state_dirty(vd->vdev_top);
6749 6764          }
6750 6765  
6751 6766          for (int c = 0; c < vd->vdev_children; c++)
6752 6767                  spa_async_remove(spa, vd->vdev_child[c]);
6753 6768  }
6754 6769  
6755 6770  static void
6756 6771  spa_async_probe(spa_t *spa, vdev_t *vd)
6757 6772  {
6758 6773          if (vd->vdev_probe_wanted) {
6759 6774                  vd->vdev_probe_wanted = B_FALSE;
6760 6775                  vdev_reopen(vd);        /* vdev_open() does the actual probe */
6761 6776          }
6762 6777  
6763 6778          for (int c = 0; c < vd->vdev_children; c++)
6764 6779                  spa_async_probe(spa, vd->vdev_child[c]);
6765 6780  }
6766 6781  
6767 6782  static void
6768 6783  spa_async_autoexpand(spa_t *spa, vdev_t *vd)
6769 6784  {
6770 6785          sysevent_id_t eid;
6771 6786          nvlist_t *attr;
6772 6787          char *physpath;
6773 6788  
6774 6789          if (!spa->spa_autoexpand)
6775 6790                  return;
6776 6791  
6777 6792          for (int c = 0; c < vd->vdev_children; c++) {
6778 6793                  vdev_t *cvd = vd->vdev_child[c];
6779 6794                  spa_async_autoexpand(spa, cvd);
6780 6795          }
6781 6796  
6782 6797          if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
6783 6798                  return;
6784 6799  
6785 6800          physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
6786 6801          (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
6787 6802  
6788 6803          VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
6789 6804          VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
6790 6805  
6791 6806          (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
6792 6807              ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
6793 6808  
6794 6809          nvlist_free(attr);
6795 6810          kmem_free(physpath, MAXPATHLEN);
6796 6811  }
6797 6812  
6798 6813  static void
6799 6814  spa_async_thread(void *arg)
6800 6815  {
6801 6816          spa_t *spa = (spa_t *)arg;
6802 6817          int tasks;
6803 6818  
6804 6819          ASSERT(spa->spa_sync_on);
6805 6820  
6806 6821          mutex_enter(&spa->spa_async_lock);
6807 6822          tasks = spa->spa_async_tasks;
6808 6823          spa->spa_async_tasks = 0;
6809 6824          mutex_exit(&spa->spa_async_lock);
6810 6825  
6811 6826          /*
6812 6827           * See if the config needs to be updated.
6813 6828           */
6814 6829          if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
6815 6830                  uint64_t old_space, new_space;
6816 6831  
6817 6832                  mutex_enter(&spa_namespace_lock);
6818 6833                  old_space = metaslab_class_get_space(spa_normal_class(spa));
6819 6834                  spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
6820 6835                  new_space = metaslab_class_get_space(spa_normal_class(spa));
6821 6836                  mutex_exit(&spa_namespace_lock);
6822 6837  
6823 6838                  /*
6824 6839                   * If the pool grew as a result of the config update,
6825 6840                   * then log an internal history event.
6826 6841                   */
6827 6842                  if (new_space != old_space) {
6828 6843                          spa_history_log_internal(spa, "vdev online", NULL,
6829 6844                              "pool '%s' size: %llu(+%llu)",
6830 6845                              spa_name(spa), new_space, new_space - old_space);
6831 6846                  }
6832 6847          }
6833 6848  
6834 6849          /*
6835 6850           * See if any devices need to be marked REMOVED.
6836 6851           */
6837 6852          if (tasks & SPA_ASYNC_REMOVE) {
6838 6853                  spa_vdev_state_enter(spa, SCL_NONE);
6839 6854                  spa_async_remove(spa, spa->spa_root_vdev);
6840 6855                  for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
6841 6856                          spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
6842 6857                  for (int i = 0; i < spa->spa_spares.sav_count; i++)
6843 6858                          spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
6844 6859                  (void) spa_vdev_state_exit(spa, NULL, 0);
6845 6860          }
6846 6861  
6847 6862          if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
6848 6863                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6849 6864                  spa_async_autoexpand(spa, spa->spa_root_vdev);
6850 6865                  spa_config_exit(spa, SCL_CONFIG, FTAG);
6851 6866          }
6852 6867  
6853 6868          /*
6854 6869           * See if any devices need to be probed.
6855 6870           */
6856 6871          if (tasks & SPA_ASYNC_PROBE) {
6857 6872                  spa_vdev_state_enter(spa, SCL_NONE);
6858 6873                  spa_async_probe(spa, spa->spa_root_vdev);
6859 6874                  (void) spa_vdev_state_exit(spa, NULL, 0);
6860 6875          }
6861 6876  
6862 6877          /*
6863 6878           * If any devices are done replacing, detach them.
6864 6879           */
6865 6880          if (tasks & SPA_ASYNC_RESILVER_DONE)
6866 6881                  spa_vdev_resilver_done(spa);
6867 6882  
6868 6883          /*
6869 6884           * Kick off a resilver.
6870 6885           */
6871 6886          if (tasks & SPA_ASYNC_RESILVER)
6872 6887                  dsl_resilver_restart(spa->spa_dsl_pool, 0);
6873 6888  
6874 6889          if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
6875 6890                  mutex_enter(&spa_namespace_lock);
6876 6891                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
6877 6892                  vdev_initialize_restart(spa->spa_root_vdev);
6878 6893                  spa_config_exit(spa, SCL_CONFIG, FTAG);
6879 6894                  mutex_exit(&spa_namespace_lock);
6880 6895          }
6881 6896  
6882 6897          /*
6883 6898           * Let the world know that we're done.
6884 6899           */
6885 6900          mutex_enter(&spa->spa_async_lock);
6886 6901          spa->spa_async_thread = NULL;
6887 6902          cv_broadcast(&spa->spa_async_cv);
6888 6903          mutex_exit(&spa->spa_async_lock);
6889 6904          thread_exit();
6890 6905  }
6891 6906  
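spa_async_request() and spa_async_thread() hand work off through a simple
bitmask protocol: requesters OR task bits in under spa_async_lock, and the
worker snapshots and clears the mask in one critical section, so a request
posted during processing is picked up by the next dispatch rather than
lost.  A user-space sketch (the task bits here are illustrative):

	#include <pthread.h>

	#define	TASK_CONFIG_UPDATE	0x1
	#define	TASK_REMOVE		0x2

	static pthread_mutex_t async_lock = PTHREAD_MUTEX_INITIALIZER;
	static int async_tasks;

	static void
	async_request(int task)
	{
		pthread_mutex_lock(&async_lock);
		async_tasks |= task;
		pthread_mutex_unlock(&async_lock);
	}

	static void
	async_thread_body(void)
	{
		int tasks;

		pthread_mutex_lock(&async_lock);
		tasks = async_tasks;	/* snapshot ... */
		async_tasks = 0;	/* ... and clear, atomically */
		pthread_mutex_unlock(&async_lock);

		if (tasks & TASK_CONFIG_UPDATE) {
			/* update the config */
		}
		if (tasks & TASK_REMOVE) {
			/* handle removals */
		}
	}
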
6892 6907  void
6893 6908  spa_async_suspend(spa_t *spa)
6894 6909  {
6895 6910          mutex_enter(&spa->spa_async_lock);
6896 6911          spa->spa_async_suspended++;
6897 6912          while (spa->spa_async_thread != NULL)
6898 6913                  cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
6899 6914          mutex_exit(&spa->spa_async_lock);
6900 6915  
6901 6916          spa_vdev_remove_suspend(spa);
6902 6917  
6903 6918          zthr_t *condense_thread = spa->spa_condense_zthr;
6904 6919          if (condense_thread != NULL && zthr_isrunning(condense_thread))
6905 6920                  VERIFY0(zthr_cancel(condense_thread));
6906 6921  
6907 6922          zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
6908 6923          if (discard_thread != NULL && zthr_isrunning(discard_thread))
6909 6924                  VERIFY0(zthr_cancel(discard_thread));
6910 6925  }
6911 6926  
6912 6927  void
6913 6928  spa_async_resume(spa_t *spa)
6914 6929  {
6915 6930          mutex_enter(&spa->spa_async_lock);
6916 6931          ASSERT(spa->spa_async_suspended != 0);
6917 6932          spa->spa_async_suspended--;
6918 6933          mutex_exit(&spa->spa_async_lock);
6919 6934          spa_restart_removal(spa);
6920 6935  
6921 6936          zthr_t *condense_thread = spa->spa_condense_zthr;
6922 6937          if (condense_thread != NULL && !zthr_isrunning(condense_thread))
6923 6938                  zthr_resume(condense_thread);
6924 6939  
6925 6940          zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr;
6926 6941          if (discard_thread != NULL && !zthr_isrunning(discard_thread))
6927 6942                  zthr_resume(discard_thread);
6928 6943  }
6929 6944  
6930 6945  static boolean_t
6931 6946  spa_async_tasks_pending(spa_t *spa)
6932 6947  {
6933 6948          uint_t non_config_tasks;
6934 6949          uint_t config_task;
6935 6950          boolean_t config_task_suspended;
6936 6951  
6937 6952          non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE;
6938 6953          config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
6939 6954          if (spa->spa_ccw_fail_time == 0) {
6940 6955                  config_task_suspended = B_FALSE;
6941 6956          } else {
6942 6957                  config_task_suspended =
6943 6958                      (gethrtime() - spa->spa_ccw_fail_time) <
6944 6959                      (zfs_ccw_retry_interval * NANOSEC);
6945 6960          }
6946 6961  
6947 6962          return (non_config_tasks || (config_task && !config_task_suspended));
6948 6963  }
6949 6964  
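The suspension test above is plain interval arithmetic: a failed config
cache write suppresses further SPA_ASYNC_CONFIG_UPDATE dispatches until
zfs_ccw_retry_interval seconds have elapsed.  A standalone illustration
(the 300-second interval is an assumption for the example):

	#include <stdint.h>

	#define	NANOSEC	1000000000LL	/* as in <sys/time.h> */

	static int
	ccw_suspended(int64_t now, int64_t fail_time, int64_t retry_sec)
	{
		if (fail_time == 0)
			return (0);	/* no failure recorded */
		return ((now - fail_time) < (retry_sec * NANOSEC));
	}

With fail_time at 10s and retry_sec = 300, ccw_suspended() stays true
until now reaches 310s, after which the config-update task becomes
eligible for dispatch again.
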
6950 6965  static void
6951 6966  spa_async_dispatch(spa_t *spa)
6952 6967  {
6953 6968          mutex_enter(&spa->spa_async_lock);
6954 6969          if (spa_async_tasks_pending(spa) &&
6955 6970              !spa->spa_async_suspended &&
6956 6971              spa->spa_async_thread == NULL &&
6957 6972              rootdir != NULL)
6958 6973                  spa->spa_async_thread = thread_create(NULL, 0,
6959 6974                      spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
6960 6975          mutex_exit(&spa->spa_async_lock);
6961 6976  }
6962 6977  
6963 6978  void
6964 6979  spa_async_request(spa_t *spa, int task)
6965 6980  {
6966 6981          zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
6967 6982          mutex_enter(&spa->spa_async_lock);
6968 6983          spa->spa_async_tasks |= task;
6969 6984          mutex_exit(&spa->spa_async_lock);
6970 6985  }
6971 6986  
6972 6987  /*
6973 6988   * ==========================================================================
6974 6989   * SPA syncing routines
6975 6990   * ==========================================================================
6976 6991   */
6977 6992  
6978 6993  static int
6979 6994  bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6980 6995  {
6981 6996          bpobj_t *bpo = arg;
6982 6997          bpobj_enqueue(bpo, bp, tx);
6983 6998          return (0);
6984 6999  }
6985 7000  
6986 7001  static int
6987 7002  spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
6988 7003  {
6989 7004          zio_t *zio = arg;
6990 7005  
6991 7006          zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
6992 7007              zio->io_flags));
6993 7008          return (0);
6994 7009  }
6995 7010  
6996 7011  /*
6997 7012   * Note: this simple function is not inlined to make it easier to dtrace the
6998 7013   * amount of time spent syncing frees.
6999 7014   */
7000 7015  static void
7001 7016  spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
7002 7017  {
7003 7018          zio_t *zio = zio_root(spa, NULL, NULL, 0);
7004 7019          bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
7005 7020          VERIFY(zio_wait(zio) == 0);
7006 7021  }
7007 7022  
7008 7023  /*
7009 7024   * Note: this simple function is not inlined to make it easier to dtrace the
7010 7025   * amount of time spent syncing deferred frees.
7011 7026   */
7012 7027  static void
7013 7028  spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
7014 7029  {
7015 7030          zio_t *zio = zio_root(spa, NULL, NULL, 0);
7016 7031          VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
7017 7032              spa_free_sync_cb, zio, tx), ==, 0);
7018 7033          VERIFY0(zio_wait(zio));
7019 7034  }
7020 7035  
7021 7036  
7022 7037  static void
7023 7038  spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
7024 7039  {
7025 7040          char *packed = NULL;
7026 7041          size_t bufsize;
7027 7042          size_t nvsize = 0;
7028 7043          dmu_buf_t *db;
7029 7044  
7030 7045          VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
7031 7046  
7032 7047          /*
7033 7048           * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
7034 7049           * information.  This avoids the dmu_buf_will_dirty() path and
7035 7050           * saves us a pre-read to get data we don't actually care about.
7036 7051           */
7037 7052          bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
7038 7053          packed = kmem_alloc(bufsize, KM_SLEEP);
7039 7054  
7040 7055          VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
7041 7056              KM_SLEEP) == 0);
7042 7057          bzero(packed + nvsize, bufsize - nvsize);
7043 7058  
7044 7059          dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
7045 7060  
7046 7061          kmem_free(packed, bufsize);
7047 7062  
7048 7063          VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
7049 7064          dmu_buf_will_dirty(db, tx);
7050 7065          *(uint64_t *)db->db_data = nvsize;
7051 7066          dmu_buf_rele(db, FTAG);
7052 7067  }
7053 7068  
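spa_sync_nvlist() rounds the packed nvlist size up to a whole number of
SPA_CONFIG_BLOCKSIZE blocks so that dmu_write() never partially dirties a
block.  P2ROUNDUP is the usual power-of-two round-up macro from
<sys/sysmacros.h>; a user-space illustration (the 16K block size is an
assumption for the example):

	#include <stdio.h>
	#include <stdint.h>

	/* align must be a power of two */
	#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

	int
	main(void)
	{
		uint64_t nvsize = 5000;
		uint64_t bufsize = P2ROUNDUP(nvsize, (uint64_t)16384);

		/* prints 16384; bytes nvsize..bufsize-1 are zero filled */
		printf("%llu\n", (unsigned long long)bufsize);
		return (0);
	}
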
7054 7069  static void
7055 7070  spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
7056 7071      const char *config, const char *entry)
7057 7072  {
7058 7073          nvlist_t *nvroot;
7059 7074          nvlist_t **list;
7060 7075          int i;
7061 7076  
7062 7077          if (!sav->sav_sync)
7063 7078                  return;
7064 7079  
7065 7080          /*
7066 7081           * Update the MOS nvlist describing the list of available devices.
7067 7082           * spa_validate_aux() will have already made sure this nvlist is
7068 7083           * valid and the vdevs are labeled appropriately.
7069 7084           */
7070 7085          if (sav->sav_object == 0) {
7071 7086                  sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
7072 7087                      DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
7073 7088                      sizeof (uint64_t), tx);
7074 7089                  VERIFY(zap_update(spa->spa_meta_objset,
7075 7090                      DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
7076 7091                      &sav->sav_object, tx) == 0);
7077 7092          }
7078 7093  
7079 7094          VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
7080 7095          if (sav->sav_count == 0) {
7081 7096                  VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
7082 7097          } else {
7083 7098                  list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
7084 7099                  for (i = 0; i < sav->sav_count; i++)
7085 7100                          list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
7086 7101                              B_FALSE, VDEV_CONFIG_L2CACHE);
7087 7102                  VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
7088 7103                      sav->sav_count) == 0);
7089 7104                  for (i = 0; i < sav->sav_count; i++)
7090 7105                          nvlist_free(list[i]);
7091 7106                  kmem_free(list, sav->sav_count * sizeof (void *));
7092 7107          }
7093 7108  
7094 7109          spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
7095 7110          nvlist_free(nvroot);
7096 7111  
7097 7112          sav->sav_sync = B_FALSE;
7098 7113  }
7099 7114  
7100 7115  /*
7101 7116   * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t.
7102 7117   * The all-vdev ZAP must be empty.
7103 7118   */
7104 7119  static void
7105 7120  spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
7106 7121  {
7107 7122          spa_t *spa = vd->vdev_spa;
7108 7123          if (vd->vdev_top_zap != 0) {
7109 7124                  VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
7110 7125                      vd->vdev_top_zap, tx));
7111 7126          }
7112 7127          if (vd->vdev_leaf_zap != 0) {
7113 7128                  VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
7114 7129                      vd->vdev_leaf_zap, tx));
7115 7130          }
7116 7131          for (uint64_t i = 0; i < vd->vdev_children; i++) {
7117 7132                  spa_avz_build(vd->vdev_child[i], avz, tx);
7118 7133          }
7119 7134  }
7120 7135  
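spa_avz_build() is a preorder walk of the vdev tree that records every
top-level and leaf ZAP object number into the all-vdev ZAP.  A toy
user-space version over a hypothetical node type shows the shape of the
traversal:

	#include <stdio.h>
	#include <stdint.h>

	typedef struct node {
		uint64_t	top_zap;	/* 0 if absent */
		uint64_t	leaf_zap;	/* 0 if absent */
		struct node	**child;
		uint64_t	nchildren;
	} node_t;

	static void
	avz_build(const node_t *n)
	{
		if (n->top_zap != 0)
			printf("add %llu\n", (unsigned long long)n->top_zap);
		if (n->leaf_zap != 0)
			printf("add %llu\n", (unsigned long long)n->leaf_zap);
		for (uint64_t i = 0; i < n->nchildren; i++)
			avz_build(n->child[i]);
	}
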
7121 7136  static void
7122 7137  spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
7123 7138  {
7124 7139          nvlist_t *config;
7125 7140  
7126 7141          /*
7127 7142           * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
7128 7143           * its config may not be dirty but we still need to build per-vdev ZAPs.
7129 7144           * Similarly, if the pool is being assembled (e.g. after a split), we
7130 7145           * need to rebuild the AVZ although the config may not be dirty.
7131 7146           */
7132 7147          if (list_is_empty(&spa->spa_config_dirty_list) &&
7133 7148              spa->spa_avz_action == AVZ_ACTION_NONE)
7134 7149                  return;
7135 7150  
7136 7151          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
7137 7152  
7138 7153          ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
7139 7154              spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
7140 7155              spa->spa_all_vdev_zaps != 0);
7141 7156  
7142 7157          if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
7143 7158                  /* Make and build the new AVZ */
7144 7159                  uint64_t new_avz = zap_create(spa->spa_meta_objset,
7145 7160                      DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
7146 7161                  spa_avz_build(spa->spa_root_vdev, new_avz, tx);
7147 7162  
7148 7163                  /* Diff old AVZ with new one */
7149 7164                  zap_cursor_t zc;
7150 7165                  zap_attribute_t za;
7151 7166  
7152 7167                  for (zap_cursor_init(&zc, spa->spa_meta_objset,
7153 7168                      spa->spa_all_vdev_zaps);
7154 7169                      zap_cursor_retrieve(&zc, &za) == 0;
7155 7170                      zap_cursor_advance(&zc)) {
7156 7171                          uint64_t vdzap = za.za_first_integer;
7157 7172                          if (zap_lookup_int(spa->spa_meta_objset, new_avz,
7158 7173                              vdzap) == ENOENT) {
7159 7174                                  /*
7160 7175                                   * ZAP is listed in old AVZ but not in new one;
7161 7176                                   * destroy it
7162 7177                                   */
7163 7178                                  VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
7164 7179                                      tx));
7165 7180                          }
7166 7181                  }
7167 7182  
7168 7183                  zap_cursor_fini(&zc);
7169 7184  
7170 7185                  /* Destroy the old AVZ */
7171 7186                  VERIFY0(zap_destroy(spa->spa_meta_objset,
7172 7187                      spa->spa_all_vdev_zaps, tx));
7173 7188  
7174 7189                  /* Replace the old AVZ in the dir obj with the new one */
7175 7190                  VERIFY0(zap_update(spa->spa_meta_objset,
7176 7191                      DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
7177 7192                      sizeof (new_avz), 1, &new_avz, tx));
7178 7193  
7179 7194                  spa->spa_all_vdev_zaps = new_avz;
7180 7195          } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
7181 7196                  zap_cursor_t zc;
7182 7197                  zap_attribute_t za;
7183 7198  
7184 7199                  /* Walk through the AVZ and destroy all listed ZAPs */
7185 7200                  for (zap_cursor_init(&zc, spa->spa_meta_objset,
7186 7201                      spa->spa_all_vdev_zaps);
7187 7202                      zap_cursor_retrieve(&zc, &za) == 0;
7188 7203                      zap_cursor_advance(&zc)) {
7189 7204                          uint64_t zap = za.za_first_integer;
7190 7205                          VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
7191 7206                  }
7192 7207  
7193 7208                  zap_cursor_fini(&zc);
7194 7209  
7195 7210                  /* Destroy and unlink the AVZ itself */
7196 7211                  VERIFY0(zap_destroy(spa->spa_meta_objset,
7197 7212                      spa->spa_all_vdev_zaps, tx));
7198 7213                  VERIFY0(zap_remove(spa->spa_meta_objset,
7199 7214                      DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
7200 7215                  spa->spa_all_vdev_zaps = 0;
7201 7216          }
7202 7217  
7203 7218          if (spa->spa_all_vdev_zaps == 0) {
7204 7219                  spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
7205 7220                      DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
7206 7221                      DMU_POOL_VDEV_ZAP_MAP, tx);
7207 7222          }
7208 7223          spa->spa_avz_action = AVZ_ACTION_NONE;
7209 7224  
7210 7225          /* Create ZAPs for vdevs that don't have them. */
7211 7226          vdev_construct_zaps(spa->spa_root_vdev, tx);
7212 7227  
7213 7228          config = spa_config_generate(spa, spa->spa_root_vdev,
7214 7229              dmu_tx_get_txg(tx), B_FALSE);
7215 7230  
7216 7231          /*
7217 7232           * If we're upgrading the spa version then make sure that
7218 7233           * the config object gets updated with the correct version.
7219 7234           */
7220 7235          if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
7221 7236                  fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
7222 7237                      spa->spa_uberblock.ub_version);
7223 7238  
7224 7239          spa_config_exit(spa, SCL_STATE, FTAG);
7225 7240  
7226 7241          nvlist_free(spa->spa_config_syncing);
7227 7242          spa->spa_config_syncing = config;
7228 7243  
7229 7244          spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
7230 7245  }
7231 7246  
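           /*
            * A minimal sketch (assumed, illustrative only) of the ZAP cursor
            * idiom the AVZ rebuild above relies on: visit each entry of an
            * int-keyed ZAP such as the AVZ and hand the per-vdev ZAP object
            * number to a caller-supplied callback.  avz_visit() is a
            * hypothetical helper, not an existing function.
            */
           #if 0
           static void
           avz_visit(objset_t *os, uint64_t avz_obj, void (*func)(uint64_t))
           {
                   zap_cursor_t zc;
                   zap_attribute_t za;

                   for (zap_cursor_init(&zc, os, avz_obj);
                       zap_cursor_retrieve(&zc, &za) == 0;
                       zap_cursor_advance(&zc))
                           func(za.za_first_integer);
                   zap_cursor_fini(&zc);
           }
           #endif
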
7232 7247  static void
7233 7248  spa_sync_version(void *arg, dmu_tx_t *tx)
7234 7249  {
7235 7250          uint64_t *versionp = arg;
7236 7251          uint64_t version = *versionp;
7237 7252          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
7238 7253  
7239 7254          /*
7240 7255           * Setting the version is special cased when first creating the pool.
7241 7256           */
7242 7257          ASSERT(tx->tx_txg != TXG_INITIAL);
7243 7258  
7244 7259          ASSERT(SPA_VERSION_IS_SUPPORTED(version));
7245 7260          ASSERT(version >= spa_version(spa));
7246 7261  
7247 7262          spa->spa_uberblock.ub_version = version;
7248 7263          vdev_config_dirty(spa->spa_root_vdev);
7249 7264          spa_history_log_internal(spa, "set", tx, "version=%lld", version);
7250 7265  }
7251 7266  
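           /*
            * For context: spa_sync_version() runs as a sync task.  A minimal
            * dispatch sketch from open context (the blocks-modified hint of 6
            * is an assumption here, mirroring similar calls in this file):
            */
           #if 0
           uint64_t ver = SPA_VERSION;
           (void) dsl_sync_task(spa_name(spa), NULL, spa_sync_version,
               &ver, 6, ZFS_SPACE_CHECK_RESERVED);
           #endif
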
7252 7267  /*
7253 7268   * Set zpool properties.
7254 7269   */
7255 7270  static void
7256 7271  spa_sync_props(void *arg, dmu_tx_t *tx)
7257 7272  {
7258 7273          nvlist_t *nvp = arg;
7259 7274          spa_t *spa = dmu_tx_pool(tx)->dp_spa;
7260 7275          objset_t *mos = spa->spa_meta_objset;
7261 7276          nvpair_t *elem = NULL;
7262 7277  
7263 7278          mutex_enter(&spa->spa_props_lock);
7264 7279  
7265 7280          while ((elem = nvlist_next_nvpair(nvp, elem))) {
7266 7281                  uint64_t intval;
7267 7282                  char *strval, *fname;
7268 7283                  zpool_prop_t prop;
7269 7284                  const char *propname;
7270 7285                  zprop_type_t proptype;
7271 7286                  spa_feature_t fid;
7272 7287  
7273 7288                  switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
7274 7289                  case ZPOOL_PROP_INVAL:
7275 7290                          /*
7276 7291                           * We checked this earlier in spa_prop_validate().
7277 7292                           */
7278 7293                          ASSERT(zpool_prop_feature(nvpair_name(elem)));
7279 7294  
7280 7295                          fname = strchr(nvpair_name(elem), '@') + 1;
7281 7296                          VERIFY0(zfeature_lookup_name(fname, &fid));
7282 7297  
7283 7298                          spa_feature_enable(spa, fid, tx);
7284 7299                          spa_history_log_internal(spa, "set", tx,
7285 7300                              "%s=enabled", nvpair_name(elem));
7286 7301                          break;
7287 7302  
7288 7303                  case ZPOOL_PROP_VERSION:
7289 7304                          intval = fnvpair_value_uint64(elem);
7290 7305                          /*
7291 7306                           * The version is synced separately before other
7292 7307                           * properties and should be correct by now.
7293 7308                           */
7294 7309                          ASSERT3U(spa_version(spa), >=, intval);
7295 7310                          break;
7296 7311  
7297 7312                  case ZPOOL_PROP_ALTROOT:
7298 7313                          /*
7299 7314                           * 'altroot' is a non-persistent property. It should
7300 7315                           * have been set temporarily at creation or import time.
7301 7316                           */
7302 7317                          ASSERT(spa->spa_root != NULL);
7303 7318                          break;
7304 7319  
7305 7320                  case ZPOOL_PROP_READONLY:
7306 7321                  case ZPOOL_PROP_CACHEFILE:
7307 7322                          /*
7308 7323                           * 'readonly' and 'cachefile' are also non-persistent
7309 7324                           * properties.
7310 7325                           */
7311 7326                          break;
7312 7327                  case ZPOOL_PROP_COMMENT:
7313 7328                          strval = fnvpair_value_string(elem);
7314 7329                          if (spa->spa_comment != NULL)
7315 7330                                  spa_strfree(spa->spa_comment);
7316 7331                          spa->spa_comment = spa_strdup(strval);
7317 7332                          /*
7318 7333                           * We need to dirty the configuration on all the vdevs
7319 7334                           * so that their labels get updated.  It's unnecessary
7320 7335                           * to do this for pool creation since the vdev's
7321 7336                           * configuration has already been dirtied.
7322 7337                           */
7323 7338                          if (tx->tx_txg != TXG_INITIAL)
7324 7339                                  vdev_config_dirty(spa->spa_root_vdev);
7325 7340                          spa_history_log_internal(spa, "set", tx,
7326 7341                              "%s=%s", nvpair_name(elem), strval);
7327 7342                          break;
7328 7343                  default:
7329 7344                          /*
7330 7345                           * Set pool property values in the poolprops mos object.
7331 7346                           */
7332 7347                          if (spa->spa_pool_props_object == 0) {
7333 7348                                  spa->spa_pool_props_object =
7334 7349                                      zap_create_link(mos, DMU_OT_POOL_PROPS,
7335 7350                                      DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
7336 7351                                      tx);
7337 7352                          }
7338 7353  
7339 7354                          /* normalize the property name */
7340 7355                          propname = zpool_prop_to_name(prop);
7341 7356                          proptype = zpool_prop_get_type(prop);
7342 7357  
7343 7358                          if (nvpair_type(elem) == DATA_TYPE_STRING) {
7344 7359                                  ASSERT(proptype == PROP_TYPE_STRING);
7345 7360                                  strval = fnvpair_value_string(elem);
7346 7361                                  VERIFY0(zap_update(mos,
7347 7362                                      spa->spa_pool_props_object, propname,
7348 7363                                      1, strlen(strval) + 1, strval, tx));
7349 7364                                  spa_history_log_internal(spa, "set", tx,
7350 7365                                      "%s=%s", nvpair_name(elem), strval);
7351 7366                          } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
7352 7367                                  intval = fnvpair_value_uint64(elem);
7353 7368  
7354 7369                                  if (proptype == PROP_TYPE_INDEX) {
7355 7370                                          const char *unused;
7356 7371                                          VERIFY0(zpool_prop_index_to_string(
7357 7372                                              prop, intval, &unused));
7358 7373                                  }
7359 7374                                  VERIFY0(zap_update(mos,
7360 7375                                      spa->spa_pool_props_object, propname,
7361 7376                                      8, 1, &intval, tx));
7362 7377                                  spa_history_log_internal(spa, "set", tx,
7363 7378                                      "%s=%lld", nvpair_name(elem), intval);
7364 7379                          } else {
7365 7380                                  ASSERT(0); /* not allowed */
7366 7381                          }
7367 7382  
7368 7383                          switch (prop) {
7369 7384                          case ZPOOL_PROP_DELEGATION:
7370 7385                                  spa->spa_delegation = intval;
7371 7386                                  break;
7372 7387                          case ZPOOL_PROP_BOOTFS:
7373 7388                                  spa->spa_bootfs = intval;
7374 7389                                  break;
7375 7390                          case ZPOOL_PROP_FAILUREMODE:
7376 7391                                  spa->spa_failmode = intval;
7377 7392                                  break;
7378 7393                          case ZPOOL_PROP_AUTOEXPAND:
7379 7394                                  spa->spa_autoexpand = intval;
7380 7395                                  if (tx->tx_txg != TXG_INITIAL)
7381 7396                                          spa_async_request(spa,
7382 7397                                              SPA_ASYNC_AUTOEXPAND);
7383 7398                                  break;
7384 7399                          case ZPOOL_PROP_DEDUPDITTO:
7385 7400                                  spa->spa_dedup_ditto = intval;
7386 7401                                  break;
7387 7402                          default:
7388 7403                                  break;
7389 7404                          }
7390 7405                  }
7391 7406  
7392 7407          }
7393 7408  
7394 7409          mutex_exit(&spa->spa_props_lock);
7395 7410  }
7396 7411  
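           /*
            * A minimal read-back sketch (assumed, illustrative only): a
            * uint64 property written by the default case above can later be
            * fetched from the poolprops object with zap_lookup(), using the
            * same 8-byte, single-entry shape passed to zap_update():
            */
           #if 0
           uint64_t val;
           int err = zap_lookup(mos, spa->spa_pool_props_object,
               zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE),
               sizeof (uint64_t), 1, &val);
           #endif
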
7397 7412  /*
7398 7413   * Perform one-time upgrade on-disk changes.  spa_version() does not
7399 7414   * reflect the new version this txg, so there must be no changes this
7400 7415   * txg to anything that the upgrade code depends on after it executes.
7401 7416   * Therefore this must be called after dsl_pool_sync() does the sync
7402 7417   * tasks.
7403 7418   */
7404 7419  static void
7405 7420  spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
7406 7421  {
7407 7422          dsl_pool_t *dp = spa->spa_dsl_pool;
7408 7423  
7409 7424          ASSERT(spa->spa_sync_pass == 1);
7410 7425  
7411 7426          rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
7412 7427  
7413 7428          if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
7414 7429              spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
7415 7430                  dsl_pool_create_origin(dp, tx);
7416 7431  
7417 7432                  /* Keeping the origin open increases spa_minref */
7418 7433                  spa->spa_minref += 3;
7419 7434          }
7420 7435  
7421 7436          if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
7422 7437              spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
7423 7438                  dsl_pool_upgrade_clones(dp, tx);
7424 7439          }
7425 7440  
7426 7441          if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
7427 7442              spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
7428 7443                  dsl_pool_upgrade_dir_clones(dp, tx);
7429 7444  
7430 7445                  /* Keeping the freedir open increases spa_minref */
7431 7446                  spa->spa_minref += 3;
7432 7447          }
7433 7448  
7434 7449          if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
7435 7450              spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
7436 7451                  spa_feature_create_zap_objects(spa, tx);
7437 7452          }
7438 7453  
7439 7454          /*
7440 7455           * The LZ4_COMPRESS feature's behavior was changed to
7441 7456           * activate_on_enable when the ability to use lz4 compression
7442 7457           * for metadata was added.  Old pools that have this feature
7443 7458           * enabled must be upgraded to have it active.
7444 7459           */
7445 7460          if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
7446 7461                  boolean_t lz4_en = spa_feature_is_enabled(spa,
7447 7462                      SPA_FEATURE_LZ4_COMPRESS);
7448 7463                  boolean_t lz4_ac = spa_feature_is_active(spa,
7449 7464                      SPA_FEATURE_LZ4_COMPRESS);
7450 7465  
7451 7466                  if (lz4_en && !lz4_ac)
7452 7467                          spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
7453 7468          }
7454 7469  
7455 7470          /*
7456 7471           * If we haven't written the salt, do so now.  Note that the
7457 7472           * feature may not be activated yet, but that's fine since
7458 7473           * the presence of this ZAP entry is backwards compatible.
7459 7474           */
7460 7475          if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
7461 7476              DMU_POOL_CHECKSUM_SALT) == ENOENT) {
7462 7477                  VERIFY0(zap_add(spa->spa_meta_objset,
7463 7478                      DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
7464 7479                      sizeof (spa->spa_cksum_salt.zcs_bytes),
7465 7480                      spa->spa_cksum_salt.zcs_bytes, tx));
7466 7481          }
7467 7482  
7468 7483          rrw_exit(&dp->dp_config_rwlock, FTAG);
7469 7484  }
7470 7485  
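           /*
            * A minimal sketch of the version-crossing idiom used throughout
            * spa_sync_upgrades(): each one-time step fires only in the txg
            * whose sync carries the pool across the relevant version
            * boundary.  SPA_VERSION_EXAMPLE is a hypothetical placeholder.
            */
           #if 0
           if (spa->spa_ubsync.ub_version < SPA_VERSION_EXAMPLE &&
               spa->spa_uberblock.ub_version >= SPA_VERSION_EXAMPLE) {
                   /* one-time on-disk changes for the new version go here */
           }
           #endif
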
7471 7486  static void
7472 7487  vdev_indirect_state_sync_verify(vdev_t *vd)
7473 7488  {
7474 7489          vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping;
7475 7490          vdev_indirect_births_t *vib = vd->vdev_indirect_births;
7476 7491  
7477 7492          if (vd->vdev_ops == &vdev_indirect_ops) {
7478 7493                  ASSERT(vim != NULL);
7479 7494                  ASSERT(vib != NULL);
7480 7495          }
7481 7496  
7482 7497          if (vdev_obsolete_sm_object(vd) != 0) {
7483 7498                  ASSERT(vd->vdev_obsolete_sm != NULL);
7484 7499                  ASSERT(vd->vdev_removing ||
7485 7500                      vd->vdev_ops == &vdev_indirect_ops);
7486 7501                  ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
7487 7502                  ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
7488 7503  
7489 7504                  ASSERT3U(vdev_obsolete_sm_object(vd), ==,
7490 7505                      space_map_object(vd->vdev_obsolete_sm));
7491 7506                  ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
7492 7507                      space_map_allocated(vd->vdev_obsolete_sm));
7493 7508          }
7494 7509          ASSERT(vd->vdev_obsolete_segments != NULL);
7495 7510  
7496 7511          /*
7497 7512           * Since frees / remaps to an indirect vdev can only
7498 7513           * happen in syncing context, the obsolete segments
7499 7514           * tree must be empty when we start syncing.
7500 7515           */
7501 7516          ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
7502 7517  }
7503 7518  
7504 7519  /*
7505 7520   * Sync the specified transaction group.  New blocks may be dirtied as
7506 7521   * part of the process, so we iterate until it converges.
7507 7522   */
7508 7523  void
7509 7524  spa_sync(spa_t *spa, uint64_t txg)
7510 7525  {
7511 7526          dsl_pool_t *dp = spa->spa_dsl_pool;
7512 7527          objset_t *mos = spa->spa_meta_objset;
7513 7528          bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
7514 7529          vdev_t *rvd = spa->spa_root_vdev;
7515 7530          vdev_t *vd;
7516 7531          dmu_tx_t *tx;
7517 7532          int error;
7518 7533          uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
7519 7534              zfs_vdev_queue_depth_pct / 100;
7520 7535  
7521 7536          VERIFY(spa_writeable(spa));
7522 7537  
7523 7538          /*
7524 7539           * Wait for i/os issued in open context that need to complete
7525 7540           * before this txg syncs.
7526 7541           */
7527 7542          (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
7528 7543          spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
7529 7544              ZIO_FLAG_CANFAIL);
7530 7545  
7531 7546          /*
7532 7547           * Lock out configuration changes.
7533 7548           */
7534 7549          spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
7535 7550  
7536 7551          spa->spa_syncing_txg = txg;
7537 7552          spa->spa_sync_pass = 0;
7538 7553  
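                   /*
                    * Throttled allocations are queued only from syncing
                    * context, so each per-allocator queue must be empty
                    * before this txg's sync begins.
                    */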
7539 7554          for (int i = 0; i < spa->spa_alloc_count; i++) {
7540 7555                  mutex_enter(&spa->spa_alloc_locks[i]);
7541 7556                  VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
7542 7557                  mutex_exit(&spa->spa_alloc_locks[i]);
7543 7558          }
7544 7559  
7545 7560          /*
7546 7561           * If there are any pending vdev state changes, convert them
7547 7562           * into config changes that go out with this transaction group.
7548 7563           */
7549 7564          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
7550 7565          while (list_head(&spa->spa_state_dirty_list) != NULL) {
7551 7566                  /*
7552 7567                   * We need the write lock here because, for aux vdevs,
7553 7568                   * calling vdev_config_dirty() modifies sav_config.
7554 7569                   * This is ugly and will become unnecessary when we
7555 7570                   * eliminate the aux vdev wart by integrating all vdevs
7556 7571                   * into the root vdev tree.
7557 7572                   */
7558 7573                  spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7559 7574                  spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
7560 7575                  while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
7561 7576                          vdev_state_clean(vd);
7562 7577                          vdev_config_dirty(vd);
7563 7578                  }
7564 7579                  spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
7565 7580                  spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
7566 7581          }
7567 7582          spa_config_exit(spa, SCL_STATE, FTAG);
7568 7583  
7569 7584          tx = dmu_tx_create_assigned(dp, txg);
7570 7585  
7571 7586          spa->spa_sync_starttime = gethrtime();
7572 7587          VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
7573 7588              spa->spa_sync_starttime + spa->spa_deadman_synctime));
7574 7589  
7575 7590          /*
7576 7591           * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
7577 7592           * set spa_deflate if we have no raid-z vdevs.
7578 7593           */
7579 7594          if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
7580 7595              spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
7581 7596                  int i;
7582 7597  
7583 7598                  for (i = 0; i < rvd->vdev_children; i++) {
7584 7599                          vd = rvd->vdev_child[i];
7585 7600                          if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
7586 7601                                  break;
7587 7602                  }
7588 7603                  if (i == rvd->vdev_children) {
7589 7604                          spa->spa_deflate = TRUE;
7590 7605                          VERIFY0(zap_add(spa->spa_meta_objset,
7591 7606                              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
7592 7607                              sizeof (uint64_t), 1, &spa->spa_deflate, tx));
7593 7608                  }
7594 7609          }
7595 7610  
7596 7611          /*
7597 7612           * Set the top-level vdev's max queue depth. Evaluate each
7598 7613           * top-level's async write queue depth in case it changed.
7599 7614           * The max queue depth will not change in the middle of syncing
7600 7615           * out this txg.
7601 7616           */
7602 7617          uint64_t slots_per_allocator = 0;
7603 7618          for (int c = 0; c < rvd->vdev_children; c++) {
7604 7619                  vdev_t *tvd = rvd->vdev_child[c];
7605 7620                  metaslab_group_t *mg = tvd->vdev_mg;
7606 7621  
7607 7622                  if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
7608 7623                      !metaslab_group_initialized(mg))
7609 7624                          continue;
7610 7625  
7611 7626                  /*
7612 7627                   * It is safe to do a lock-free check here because only async
7613 7628                   * allocations look at mg_max_alloc_queue_depth, and async
7614 7629                   * allocations all happen from spa_sync().
7615 7630                   */
7616 7631                  for (int i = 0; i < spa->spa_alloc_count; i++)
7617 7632                          ASSERT0(refcount_count(&(mg->mg_alloc_queue_depth[i])));
7618 7633                  mg->mg_max_alloc_queue_depth = max_queue_depth;
7619 7634  
7620 7635                  for (int i = 0; i < spa->spa_alloc_count; i++) {
7621 7636                          mg->mg_cur_max_alloc_queue_depth[i] =
7622 7637                              zfs_vdev_def_queue_depth;
7623 7638                  }
7624 7639                  slots_per_allocator += zfs_vdev_def_queue_depth;
7625 7640          }
7626 7641          metaslab_class_t *mc = spa_normal_class(spa);
7627 7642          for (int i = 0; i < spa->spa_alloc_count; i++) {
7628 7643                  ASSERT0(refcount_count(&mc->mc_alloc_slots[i]));
7629 7644                  mc->mc_alloc_max_slots[i] = slots_per_allocator;
7630 7645          }
7631 7646          mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;
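                   /*
                    * Worked example, assuming the stock tunables
                    * zfs_vdev_async_write_max_active == 10 and
                    * zfs_vdev_queue_depth_pct == 1000: max_queue_depth is
                    * 10 * 1000 / 100 == 100 queued async allocations per
                    * allocator, while each allocator starts a txg at
                    * zfs_vdev_def_queue_depth slots per top-level vdev.
                    */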
7632 7647  
7633 7648          for (int c = 0; c < rvd->vdev_children; c++) {
7634 7649                  vdev_t *vd = rvd->vdev_child[c];
7635 7650                  vdev_indirect_state_sync_verify(vd);
7636 7651  
7637 7652                  if (vdev_indirect_should_condense(vd)) {
7638 7653                          spa_condense_indirect_start_sync(vd, tx);
7639 7654                          break;
7640 7655                  }
7641 7656          }
7642 7657  
7643 7658          /*
7644 7659           * Iterate to convergence.
7645 7660           */
7646 7661          do {
7647 7662                  int pass = ++spa->spa_sync_pass;
7648 7663  
7649 7664                  spa_sync_config_object(spa, tx);
7650 7665                  spa_sync_aux_dev(spa, &spa->spa_spares, tx,
7651 7666                      ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
7652 7667                  spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
7653 7668                      ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
7654 7669                  spa_errlog_sync(spa, txg);
7655 7670                  dsl_pool_sync(dp, txg);
7656 7671  
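                           /*
                            * With the stock zfs_sync_pass_deferred_free of 2
                            * (an assumption here), frees are issued directly
                            * only in pass 1 and are deferred to the bpobj in
                            * every later pass.
                            */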
7657 7672                  if (pass < zfs_sync_pass_deferred_free) {
7658 7673                          spa_sync_frees(spa, free_bpl, tx);
7659 7674                  } else {
7660 7675                          /*
7661 7676                           * We cannot defer frees in pass 1, because
7662 7677                           * we sync the deferred frees later in pass 1.
7663 7678                           */
7664 7679                          ASSERT3U(pass, >, 1);
7665 7680                          bplist_iterate(free_bpl, bpobj_enqueue_cb,
7666 7681                              &spa->spa_deferred_bpobj, tx);
7667 7682                  }
7668 7683  
7669 7684                  ddt_sync(spa, txg);
7670 7685                  dsl_scan_sync(dp, tx);
7671 7686  
7672 7687                  if (spa->spa_vdev_removal != NULL)
7673 7688                          svr_sync(spa, tx);
7674 7689  
7675 7690                  while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
7676 7691                      != NULL)
7677 7692                          vdev_sync(vd, txg);
7678 7693  
7679 7694                  if (pass == 1) {
7680 7695                          spa_sync_upgrades(spa, tx);
7681 7696                          ASSERT3U(txg, >=,
7682 7697                              spa->spa_uberblock.ub_rootbp.blk_birth);
7683 7698                          /*
7684 7699                           * Note: We need to check if the MOS is dirty
7685 7700                           * because we could have marked the MOS dirty
7686 7701                           * without updating the uberblock (e.g. if we
7687 7702                           * have sync tasks but no dirty user data).  We
7688 7703                           * need to check the uberblock's rootbp because
7689 7704                           * it is updated if we have synced out dirty
7690 7705                           * data (though in this case the MOS will most
7691 7706                           * likely also be dirty due to second order
7692 7707                           * effects, we don't want to rely on that here).
7693 7708                           */
7694 7709                          if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
7695 7710                              !dmu_objset_is_dirty(mos, txg)) {
7696 7711                                  /*
7697 7712                                   * Nothing changed on the first pass,
7698 7713                                   * therefore this TXG is a no-op.  Avoid
7699 7714                                   * syncing deferred frees, so that we
7700 7715                                   * can keep this TXG as a no-op.
7701 7716                                   */
7702 7717                                  ASSERT(txg_list_empty(&dp->dp_dirty_datasets,
7703 7718                                      txg));
7704 7719                                  ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
7705 7720                                  ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
7706 7721                                  ASSERT(txg_list_empty(&dp->dp_early_sync_tasks,
7707 7722                                      txg));
7708 7723                                  break;
7709 7724                          }
7710 7725                          spa_sync_deferred_frees(spa, tx);
7711 7726                  }
7712 7727  
7713 7728          } while (dmu_objset_is_dirty(mos, txg));
7714 7729  
7715 7730          if (!list_is_empty(&spa->spa_config_dirty_list)) {
7716 7731                  /*
7717 7732                   * Make sure that the number of ZAPs for all the vdevs matches
7718 7733                   * the number of ZAPs in the per-vdev ZAP list. This only gets
7719 7734                   * called if the config is dirty; otherwise there may be
7720 7735                   * outstanding AVZ operations that weren't completed in
7721 7736                   * spa_sync_config_object.
7722 7737                   */
7723 7738                  uint64_t all_vdev_zap_entry_count;
7724 7739                  ASSERT0(zap_count(spa->spa_meta_objset,
7725 7740                      spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
7726 7741                  ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
7727 7742                      all_vdev_zap_entry_count);
7728 7743          }
7729 7744  
7730 7745          if (spa->spa_vdev_removal != NULL) {
7731 7746                  ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
7732 7747          }
7733 7748  
7734 7749          /*
7735 7750           * Rewrite the vdev configuration (which includes the uberblock)
7736 7751           * to commit the transaction group.
7737 7752           *
7738 7753           * If there are no dirty vdevs, we sync the uberblock to a few
7739 7754           * random top-level vdevs that are known to be visible in the
7740 7755           * config cache (see spa_vdev_add() for a complete description).
7741 7756           * If there *are* dirty vdevs, sync the uberblock to all vdevs.
7742 7757           */
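                   /*
                    * If vdev_config_sync() fails, the pool is suspended; we
                    * wait for it to be resumed and retry, so this loop exits
                    * only once a label write has succeeded.
                    */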
7743 7758          for (;;) {
7744 7759                  /*
7745 7760                   * We hold SCL_STATE to prevent vdev open/close/etc.
7746 7761                   * while we're attempting to write the vdev labels.
7747 7762                   */
7748 7763                  spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
7749 7764  
7750 7765                  if (list_is_empty(&spa->spa_config_dirty_list)) {
7751 7766                          vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
7752 7767                          int svdcount = 0;
7753 7768                          int children = rvd->vdev_children;
7754 7769                          int c0 = spa_get_random(children);
7755 7770  
7756 7771                          for (int c = 0; c < children; c++) {
7757 7772                                  vd = rvd->vdev_child[(c0 + c) % children];
7758 7773  
7759 7774                                  /* Stop when revisiting the first vdev */
7760 7775                                  if (c > 0 && svd[0] == vd)
7761 7776                                          break;
7762 7777  
7763 7778                                  if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
7764 7779                                      !vdev_is_concrete(vd))
7765 7780                                          continue;
7766 7781  
7767 7782                                  svd[svdcount++] = vd;
7768 7783                                  if (svdcount == SPA_SYNC_MIN_VDEVS)
7769 7784                                          break;
7770 7785                          }
7771 7786                          error = vdev_config_sync(svd, svdcount, txg);
7772 7787                  } else {
7773 7788                          error = vdev_config_sync(rvd->vdev_child,
7774 7789                              rvd->vdev_children, txg);
7775 7790                  }
7776 7791  
7777 7792                  if (error == 0)
7778 7793                          spa->spa_last_synced_guid = rvd->vdev_guid;
7779 7794  
7780 7795                  spa_config_exit(spa, SCL_STATE, FTAG);
7781 7796  
7782 7797                  if (error == 0)
7783 7798                          break;
7784 7799                  zio_suspend(spa, NULL);
7785 7800                  zio_resume_wait(spa);
7786 7801          }
7787 7802          dmu_tx_commit(tx);
7788 7803  
7789 7804          VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
7790 7805  
7791 7806          /*
7792 7807           * Clear the dirty config list.
7793 7808           */
7794 7809          while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
7795 7810                  vdev_config_clean(vd);
7796 7811  
7797 7812          /*
7798 7813           * Now that the new config has synced transactionally,
7799 7814           * let it become visible to the config cache.
7800 7815           */
7801 7816          if (spa->spa_config_syncing != NULL) {
7802 7817                  spa_config_set(spa, spa->spa_config_syncing);
7803 7818                  spa->spa_config_txg = txg;
7804 7819                  spa->spa_config_syncing = NULL;
7805 7820          }
7806 7821  
7807 7822          dsl_pool_sync_done(dp, txg);
7808 7823  
7809 7824          for (int i = 0; i < spa->spa_alloc_count; i++) {
7810 7825                  mutex_enter(&spa->spa_alloc_locks[i]);
7811 7826                  VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i]));
7812 7827                  mutex_exit(&spa->spa_alloc_locks[i]);
7813 7828          }
7814 7829  
7815 7830          /*
7816 7831           * Update usable space statistics.
7817 7832           */
7818 7833          while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
7819 7834              != NULL)
7820 7835                  vdev_sync_done(vd, txg);
7821 7836  
7822 7837          spa_update_dspace(spa);
7823 7838  
7824 7839          /*
7825 7840           * It had better be the case that we didn't dirty anything
7826 7841           * since vdev_config_sync().
7827 7842           */
7828 7843          ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
7829 7844          ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
7830 7845          ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
7831 7846  
7832 7847          while (zfs_pause_spa_sync)
7833 7848                  delay(1);
7834 7849  
7835 7850          spa->spa_sync_pass = 0;
7836 7851  
7837 7852          /*
7838 7853           * Update the last synced uberblock here. We want to do this at
7839 7854           * the end of spa_sync() so that consumers of spa_last_synced_txg()
7840 7855           * will be guaranteed that all the processing associated with
7841 7856           * that txg has been completed.
7842 7857           */
7843 7858          spa->spa_ubsync = spa->spa_uberblock;
7844 7859          spa_config_exit(spa, SCL_CONFIG, FTAG);
7845 7860  
7846 7861          spa_handle_ignored_writes(spa);
7847 7862  
7848 7863          /*
7849 7864           * If any async tasks have been requested, kick them off.
7850 7865           */
7851 7866          spa_async_dispatch(spa);
7852 7867  }
7853 7868  
7854 7869  /*
7855 7870   * Sync all pools.  We don't want to hold the namespace lock across these
7856 7871   * operations, so we take a reference on the spa_t and drop the lock during the
7857 7872   * sync.
7858 7873   */
7859 7874  void
7860 7875  spa_sync_allpools(void)
7861 7876  {
7862 7877          spa_t *spa = NULL;
7863 7878          mutex_enter(&spa_namespace_lock);
7864 7879          while ((spa = spa_next(spa)) != NULL) {
7865 7880                  if (spa_state(spa) != POOL_STATE_ACTIVE ||
7866 7881                      !spa_writeable(spa) || spa_suspended(spa))
7867 7882                          continue;
7868 7883                  spa_open_ref(spa, FTAG);
7869 7884                  mutex_exit(&spa_namespace_lock);
7870 7885                  txg_wait_synced(spa_get_dsl(spa), 0);
7871 7886                  mutex_enter(&spa_namespace_lock);
7872 7887                  spa_close(spa, FTAG);
7873 7888          }
7874 7889          mutex_exit(&spa_namespace_lock);
7875 7890  }
7876 7891  
7877 7892  /*
7878 7893   * ==========================================================================
7879 7894   * Miscellaneous routines
7880 7895   * ==========================================================================
7881 7896   */
7882 7897  
7883 7898  /*
7884 7899   * Remove all pools in the system.
7885 7900   */
7886 7901  void
7887 7902  spa_evict_all(void)
7888 7903  {
7889 7904          spa_t *spa;
7890 7905  
7891 7906          /*
7892 7907           * Remove all cached state.  All pools should be closed now,
7893 7908           * so every spa in the AVL tree should be unreferenced.
7894 7909           */
7895 7910          mutex_enter(&spa_namespace_lock);
7896 7911          while ((spa = spa_next(NULL)) != NULL) {
7897 7912                  /*
7898 7913                   * Stop async tasks.  The async thread may need to detach
7899 7914                   * a device that's been replaced, which requires grabbing
7900 7915                   * spa_namespace_lock, so we must drop it here.
7901 7916                   */
7902 7917                  spa_open_ref(spa, FTAG);
7903 7918                  mutex_exit(&spa_namespace_lock);
7904 7919                  spa_async_suspend(spa);
7905 7920                  mutex_enter(&spa_namespace_lock);
7906 7921                  spa_close(spa, FTAG);
7907 7922  
7908 7923                  if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
7909 7924                          spa_unload(spa);
7910 7925                          spa_deactivate(spa);
7911 7926                  }
7912 7927                  spa_remove(spa);
7913 7928          }
7914 7929          mutex_exit(&spa_namespace_lock);
7915 7930  }
7916 7931  
7917 7932  vdev_t *
7918 7933  spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
7919 7934  {
7920 7935          vdev_t *vd;
7921 7936          int i;
7922 7937  
7923 7938          if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
7924 7939                  return (vd);
7925 7940  
7926 7941          if (aux) {
7927 7942                  for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
7928 7943                          vd = spa->spa_l2cache.sav_vdevs[i];
7929 7944                          if (vd->vdev_guid == guid)
7930 7945                                  return (vd);
7931 7946                  }
7932 7947  
7933 7948                  for (i = 0; i < spa->spa_spares.sav_count; i++) {
7934 7949                          vd = spa->spa_spares.sav_vdevs[i];
7935 7950                          if (vd->vdev_guid == guid)
7936 7951                                  return (vd);
7937 7952                  }
7938 7953          }
7939 7954  
7940 7955          return (NULL);
7941 7956  }
7942 7957  
7943 7958  void
7944 7959  spa_upgrade(spa_t *spa, uint64_t version)
7945 7960  {
7946 7961          ASSERT(spa_writeable(spa));
7947 7962  
7948 7963          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
7949 7964  
7950 7965          /*
7951 7966           * This should only be called for a non-faulted pool, and since a
7952 7967           * future version would result in an unopenable pool, this shouldn't be
7953 7968           * possible.
7954 7969           */
7955 7970          ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
7956 7971          ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
7957 7972  
7958 7973          spa->spa_uberblock.ub_version = version;
7959 7974          vdev_config_dirty(spa->spa_root_vdev);
7960 7975  
7961 7976          spa_config_exit(spa, SCL_ALL, FTAG);
7962 7977  
7963 7978          txg_wait_synced(spa_get_dsl(spa), 0);
7964 7979  }
7965 7980  
7966 7981  boolean_t
7967 7982  spa_has_spare(spa_t *spa, uint64_t guid)
7968 7983  {
7969 7984          int i;
7970 7985          uint64_t spareguid;
7971 7986          spa_aux_vdev_t *sav = &spa->spa_spares;
7972 7987  
7973 7988          for (i = 0; i < sav->sav_count; i++)
7974 7989                  if (sav->sav_vdevs[i]->vdev_guid == guid)
7975 7990                          return (B_TRUE);
7976 7991  
7977 7992          for (i = 0; i < sav->sav_npending; i++) {
7978 7993                  if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
7979 7994                      &spareguid) == 0 && spareguid == guid)
7980 7995                          return (B_TRUE);
7981 7996          }
7982 7997  
7983 7998          return (B_FALSE);
7984 7999  }
7985 8000  
7986 8001  /*
7987 8002   * Check if a pool has an active shared spare device.
7988 8003   * Note: an active spare's reference count is 2: once as a spare, once as a replacement.
7989 8004   */
7990 8005  static boolean_t
7991 8006  spa_has_active_shared_spare(spa_t *spa)
7992 8007  {
7993 8008          int i, refcnt;
7994 8009          uint64_t pool;
7995 8010          spa_aux_vdev_t *sav = &spa->spa_spares;
7996 8011  
7997 8012          for (i = 0; i < sav->sav_count; i++) {
7998 8013                  if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
7999 8014                      &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
8000 8015                      refcnt > 2)
8001 8016                          return (B_TRUE);
8002 8017          }
8003 8018  
8004 8019          return (B_FALSE);
8005 8020  }
8006 8021  
8007 8022  sysevent_t *
8008 8023  spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
8009 8024  {
8010 8025          sysevent_t              *ev = NULL;
8011 8026  #ifdef _KERNEL
8012 8027          sysevent_attr_list_t    *attr = NULL;
8013 8028          sysevent_value_t        value;
8014 8029  
8015 8030          ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
8016 8031              SE_SLEEP);
8017 8032          ASSERT(ev != NULL);
8018 8033  
8019 8034          value.value_type = SE_DATA_TYPE_STRING;
8020 8035          value.value.sv_string = spa_name(spa);
8021 8036          if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
8022 8037                  goto done;
8023 8038  
8024 8039          value.value_type = SE_DATA_TYPE_UINT64;
8025 8040          value.value.sv_uint64 = spa_guid(spa);
8026 8041          if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
8027 8042                  goto done;
8028 8043  
8029 8044          if (vd) {
8030 8045                  value.value_type = SE_DATA_TYPE_UINT64;
8031 8046                  value.value.sv_uint64 = vd->vdev_guid;
8032 8047                  if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
8033 8048                      SE_SLEEP) != 0)
8034 8049                          goto done;
8035 8050  
8036 8051                  if (vd->vdev_path) {
8037 8052                          value.value_type = SE_DATA_TYPE_STRING;
8038 8053                          value.value.sv_string = vd->vdev_path;
8039 8054                          if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
8040 8055                              &value, SE_SLEEP) != 0)
8041 8056                                  goto done;
8042 8057                  }
8043 8058          }
8044 8059  
8045 8060          if (hist_nvl != NULL) {
8046 8061                  fnvlist_merge((nvlist_t *)attr, hist_nvl);
8047 8062          }
8048 8063  
8049 8064          if (sysevent_attach_attributes(ev, attr) != 0)
8050 8065                  goto done;
8051 8066          attr = NULL;
8052 8067  
8053 8068  done:
8054 8069          if (attr)
8055 8070                  sysevent_free_attr(attr);
8056 8071  
8057 8072  #endif
8058 8073          return (ev);
8059 8074  }
8060 8075  
8061 8076  void
8062 8077  spa_event_post(sysevent_t *ev)
8063 8078  {
8064 8079  #ifdef _KERNEL
8065 8080          sysevent_id_t           eid;
8066 8081  
8067 8082          (void) log_sysevent(ev, SE_SLEEP, &eid);
8068 8083          sysevent_free(ev);
8069 8084  #endif
8070 8085  }
8071 8086  
8072 8087  void
8073 8088  spa_event_discard(sysevent_t *ev)
8074 8089  {
8075 8090  #ifdef _KERNEL
8076 8091          sysevent_free(ev);
8077 8092  #endif
8078 8093  }
8079 8094  
8080 8095  /*
8081 8096   * Post a sysevent corresponding to the given event.  The 'name' must be one of
8082 8097   * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
8083 8098   * filled in from the spa and (optionally) the vdev and history nvl.  This
8084 8099   * doesn't do anything in the userland libzpool, as we don't want consumers to
8085 8100   * misinterpret ztest or zdb as real changes.
8086 8101   */
8087 8102  void
8088 8103  spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
8089 8104  {
8090 8105          spa_event_post(spa_event_create(spa, vd, hist_nvl, name));
8091 8106  }
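
           /*
            * Typical usage (sketch): the ESC_ZFS_* names come from
            * sys/sysevent/eventdefs.h, e.g. at pool creation:
            *
            *      spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
            */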
  