2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
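
Review note: with the feature-flags work (2747), pool features surface as
"feature@<name>" properties. A minimal sketch of the nvlist shape accepted by
the new validation in spa_prop_validate() below, assuming an open spa_t *spa
and the async_destroy feature introduced by this change:

    nvlist_t *props;
    int error;

    /* Feature properties must be DATA_TYPE_UINT64 with value 0. */
    VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_uint64(props, "feature@async_destroy", 0) == 0);

    /* spa_prop_set() then syncs the pool up to SPA_VERSION_FEATURES. */
    error = spa_prop_set(spa, props);
    nvlist_free(props);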
    
      
    
          --- old/usr/src/uts/common/fs/zfs/spa.c
          +++ new/usr/src/uts/common/fs/zfs/spa.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  25   25   * Copyright (c) 2012 by Delphix. All rights reserved.
  26   26   */
  27   27  
  28   28  /*
  29   29   * This file contains all the routines used when modifying on-disk SPA state.
  30   30   * This includes opening, importing, destroying, exporting a pool, and syncing a
  31   31   * pool.
  32   32   */
  33   33  
  34   34  #include <sys/zfs_context.h>
  35   35  #include <sys/fm/fs/zfs.h>
  36   36  #include <sys/spa_impl.h>
  37   37  #include <sys/zio.h>
  38   38  #include <sys/zio_checksum.h>
  39   39  #include <sys/dmu.h>
  40   40  #include <sys/dmu_tx.h>
  41   41  #include <sys/zap.h>
  42   42  #include <sys/zil.h>
  43   43  #include <sys/ddt.h>
  44   44  #include <sys/vdev_impl.h>
  45   45  #include <sys/metaslab.h>
  46   46  #include <sys/metaslab_impl.h>
  47   47  #include <sys/uberblock_impl.h>
  48   48  #include <sys/txg.h>
  49   49  #include <sys/avl.h>
  50   50  #include <sys/dmu_traverse.h>
  51   51  #include <sys/dmu_objset.h>
  52   52  #include <sys/unique.h>
  53   53  #include <sys/dsl_pool.h>
  54   54  #include <sys/dsl_dataset.h>
  
  55   55  #include <sys/dsl_dir.h>
  56   56  #include <sys/dsl_prop.h>
  57   57  #include <sys/dsl_synctask.h>
  58   58  #include <sys/fs/zfs.h>
  59   59  #include <sys/arc.h>
  60   60  #include <sys/callb.h>
  61   61  #include <sys/systeminfo.h>
  62   62  #include <sys/spa_boot.h>
  63   63  #include <sys/zfs_ioctl.h>
  64   64  #include <sys/dsl_scan.h>
       65 +#include <sys/zfeature.h>
  65   66  
  66   67  #ifdef  _KERNEL
  67   68  #include <sys/bootprops.h>
  68   69  #include <sys/callb.h>
  69   70  #include <sys/cpupart.h>
  70   71  #include <sys/pool.h>
  71   72  #include <sys/sysdc.h>
  72   73  #include <sys/zone.h>
  73   74  #endif  /* _KERNEL */
  74   75  
  75   76  #include "zfs_prop.h"
  76   77  #include "zfs_comutil.h"
  77   78  
  78   79  typedef enum zti_modes {
  79   80          zti_mode_fixed,                 /* value is # of threads (min 1) */
  80   81          zti_mode_online_percent,        /* value is % of online CPUs */
  81   82          zti_mode_batch,                 /* cpu-intensive; value is ignored */
  82   83          zti_mode_null,                  /* don't create a taskq */
  83   84          zti_nmodes
  84   85  } zti_modes_t;
  85   86  
  86   87  #define ZTI_FIX(n)      { zti_mode_fixed, (n) }
  87   88  #define ZTI_PCT(n)      { zti_mode_online_percent, (n) }
  88   89  #define ZTI_BATCH       { zti_mode_batch, 0 }
  89   90  #define ZTI_NULL        { zti_mode_null, 0 }
  90   91  
  91   92  #define ZTI_ONE         ZTI_FIX(1)
  92   93  
  93   94  typedef struct zio_taskq_info {
  94   95          enum zti_modes zti_mode;
  95   96          uint_t zti_value;
  96   97  } zio_taskq_info_t;
  97   98  
  98   99  static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
  99  100          "issue", "issue_high", "intr", "intr_high"
 100  101  };
 101  102  
 102  103  /*
 103  104   * Define the taskq threads for the following I/O types:
 104  105   *      NULL, READ, WRITE, FREE, CLAIM, and IOCTL
 105  106   */
  
 106  107  const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 107  108          /* ISSUE        ISSUE_HIGH      INTR            INTR_HIGH */
 108  109          { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 109  110          { ZTI_FIX(8),   ZTI_NULL,       ZTI_BATCH,      ZTI_NULL },
 110  111          { ZTI_BATCH,    ZTI_FIX(5),     ZTI_FIX(8),     ZTI_FIX(5) },
 111  112          { ZTI_FIX(100), ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 112  113          { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 113  114          { ZTI_ONE,      ZTI_NULL,       ZTI_ONE,        ZTI_NULL },
 114  115  };
 115  116  
      117 +static dsl_syncfunc_t spa_sync_version;
 116  118  static dsl_syncfunc_t spa_sync_props;
 117  119  static boolean_t spa_has_active_shared_spare(spa_t *spa);
 118  120  static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
 119  121      spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
 120  122      char **ereport);
 121  123  static void spa_vdev_resilver_done(spa_t *spa);
 122  124  
 123  125  uint_t          zio_taskq_batch_pct = 100;      /* 1 thread per cpu in pset */
 124  126  id_t            zio_taskq_psrset_bind = PS_NONE;
 125  127  boolean_t       zio_taskq_sysdc = B_TRUE;       /* use SDC scheduling class */
 126  128  uint_t          zio_taskq_basedc = 80;          /* base duty cycle */
 127  129  
 128  130  boolean_t       spa_create_process = B_TRUE;    /* no process ==> no sysdc */
 129  131  
 130  132  /*
 131  133   * This (illegal) pool name is used when temporarily importing a spa_t in order
 132  134   * to get the vdev stats associated with the imported devices.
 133  135   */
 134  136  #define TRYIMPORT_NAME  "$import"
 135  137  
 136  138  /*
 137  139   * ==========================================================================
 138  140   * SPA properties routines
 139  141   * ==========================================================================
 140  142   */
 141  143  
 142  144  /*
 143  145   * Add a (source=src, propname=propval) list to an nvlist.
 144  146   */
 145  147  static void
 146  148  spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
 147  149      uint64_t intval, zprop_source_t src)
 148  150  {
 149  151          const char *propname = zpool_prop_to_name(prop);
 150  152          nvlist_t *propval;
 151  153  
 152  154          VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 153  155          VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0);
 154  156  
 155  157          if (strval != NULL)
 156  158                  VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0);
 157  159          else
 158  160                  VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0);
 159  161  
 160  162          VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0);
  
 161  163          nvlist_free(propval);
 162  164  }
 163  165  
 164  166  /*
 165  167   * Get property values from the spa configuration.
 166  168   */
 167  169  static void
 168  170  spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 169  171  {
 170  172          vdev_t *rvd = spa->spa_root_vdev;
      173 +        dsl_pool_t *pool = spa->spa_dsl_pool;
 171  174          uint64_t size;
 172  175          uint64_t alloc;
 173  176          uint64_t space;
 174  177          uint64_t cap, version;
 175  178          zprop_source_t src = ZPROP_SRC_NONE;
 176  179          spa_config_dirent_t *dp;
 177  180  
 178  181          ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 179  182  
 180  183          if (rvd != NULL) {
 181  184                  alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 182  185                  size = metaslab_class_get_space(spa_normal_class(spa));
 183  186                  spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 184  187                  spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 185  188                  spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 186  189                  spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 187  190                      size - alloc, src);
 188  191  
 189  192                  space = 0;
 190  193                  for (int c = 0; c < rvd->vdev_children; c++) {
 191  194                          vdev_t *tvd = rvd->vdev_child[c];
 192  195                          space += tvd->vdev_max_asize - tvd->vdev_asize;
 193  196                  }
 194  197                  spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space,
 195  198                      src);
 196  199  
 197  200                  spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 198  201                      (spa_mode(spa) == FREAD), src);
 199  202  
 200  203                  cap = (size == 0) ? 0 : (alloc * 100 / size);
 201  204                  spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 202  205  
 203  206                  spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 204  207                      ddt_get_pool_dedup_ratio(spa), src);
 205  208  
 206  209                  spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
  
 207  210                      rvd->vdev_state, src);
 208  211  
 209  212                  version = spa_version(spa);
 210  213                  if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 211  214                          src = ZPROP_SRC_DEFAULT;
 212  215                  else
 213  216                          src = ZPROP_SRC_LOCAL;
 214  217                  spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 215  218          }
 216  219  
      220 +        if (pool != NULL) {
      221 +                dsl_dir_t *freedir = pool->dp_free_dir;
      222 +
      223 +                /*
      224 +                 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
       225 +                 * so when opening pools predating it, freedir will be NULL.
      226 +                 */
      227 +                if (freedir != NULL) {
      228 +                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
      229 +                            freedir->dd_phys->dd_used_bytes, src);
      230 +                } else {
      231 +                        spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
      232 +                            NULL, 0, src);
      233 +                }
      234 +        }
      235 +
 217  236          spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 218  237  
 219  238          if (spa->spa_comment != NULL) {
 220  239                  spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 221  240                      0, ZPROP_SRC_LOCAL);
 222  241          }
 223  242  
 224  243          if (spa->spa_root != NULL)
 225  244                  spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 226  245                      0, ZPROP_SRC_LOCAL);
 227  246  
 228  247          if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 229  248                  if (dp->scd_path == NULL) {
 230  249                          spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 231  250                              "none", 0, ZPROP_SRC_LOCAL);
 232  251                  } else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 233  252                          spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 234  253                              dp->scd_path, 0, ZPROP_SRC_LOCAL);
 235  254                  }
 236  255          }
 237  256  }
 238  257  
 239  258  /*
 240  259   * Get zpool property values.
 241  260   */
 242  261  int
 243  262  spa_prop_get(spa_t *spa, nvlist_t **nvp)
 244  263  {
 245  264          objset_t *mos = spa->spa_meta_objset;
 246  265          zap_cursor_t zc;
 247  266          zap_attribute_t za;
 248  267          int err;
 249  268  
 250  269          VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 251  270  
 252  271          mutex_enter(&spa->spa_props_lock);
 253  272  
 254  273          /*
 255  274           * Get properties from the spa config.
 256  275           */
 257  276          spa_prop_get_config(spa, nvp);
 258  277  
 259  278          /* If no pool property object, no more prop to get. */
 260  279          if (mos == NULL || spa->spa_pool_props_object == 0) {
 261  280                  mutex_exit(&spa->spa_props_lock);
 262  281                  return (0);
 263  282          }
 264  283  
 265  284          /*
 266  285           * Get properties from the MOS pool property object.
 267  286           */
 268  287          for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 269  288              (err = zap_cursor_retrieve(&zc, &za)) == 0;
 270  289              zap_cursor_advance(&zc)) {
 271  290                  uint64_t intval = 0;
 272  291                  char *strval = NULL;
 273  292                  zprop_source_t src = ZPROP_SRC_DEFAULT;
 274  293                  zpool_prop_t prop;
 275  294  
 276  295                  if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
 277  296                          continue;
 278  297  
 279  298                  switch (za.za_integer_length) {
 280  299                  case 8:
 281  300                          /* integer property */
 282  301                          if (za.za_first_integer !=
 283  302                              zpool_prop_default_numeric(prop))
 284  303                                  src = ZPROP_SRC_LOCAL;
 285  304  
 286  305                          if (prop == ZPOOL_PROP_BOOTFS) {
 287  306                                  dsl_pool_t *dp;
 288  307                                  dsl_dataset_t *ds = NULL;
 289  308  
 290  309                                  dp = spa_get_dsl(spa);
 291  310                                  rw_enter(&dp->dp_config_rwlock, RW_READER);
 292  311                                  if (err = dsl_dataset_hold_obj(dp,
 293  312                                      za.za_first_integer, FTAG, &ds)) {
 294  313                                          rw_exit(&dp->dp_config_rwlock);
 295  314                                          break;
 296  315                                  }
 297  316  
 298  317                                  strval = kmem_alloc(
 299  318                                      MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
 300  319                                      KM_SLEEP);
 301  320                                  dsl_dataset_name(ds, strval);
 302  321                                  dsl_dataset_rele(ds, FTAG);
 303  322                                  rw_exit(&dp->dp_config_rwlock);
 304  323                          } else {
 305  324                                  strval = NULL;
 306  325                                  intval = za.za_first_integer;
 307  326                          }
 308  327  
 309  328                          spa_prop_add_list(*nvp, prop, strval, intval, src);
 310  329  
 311  330                          if (strval != NULL)
 312  331                                  kmem_free(strval,
 313  332                                      MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
 314  333  
 315  334                          break;
 316  335  
 317  336                  case 1:
 318  337                          /* string property */
 319  338                          strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
 320  339                          err = zap_lookup(mos, spa->spa_pool_props_object,
 321  340                              za.za_name, 1, za.za_num_integers, strval);
 322  341                          if (err) {
 323  342                                  kmem_free(strval, za.za_num_integers);
 324  343                                  break;
 325  344                          }
 326  345                          spa_prop_add_list(*nvp, prop, strval, 0, src);
 327  346                          kmem_free(strval, za.za_num_integers);
 328  347                          break;
 329  348  
 330  349                  default:
 331  350                          break;
 332  351                  }
 333  352          }
 334  353          zap_cursor_fini(&zc);
 335  354          mutex_exit(&spa->spa_props_lock);
 336  355  out:
 337  356          if (err && err != ENOENT) {
 338  357                  nvlist_free(*nvp);
 339  358                  *nvp = NULL;
 340  359                  return (err);
 341  360          }
 342  361  
 343  362          return (0);
 344  363  }
 345  364  
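Review note: the new hunk in spa_prop_get_config() above reports the bytes
still charged to the pool's $FREE dir (dd_used_bytes) while an async destroy
proceeds. A sketch of reading the value back through spa_prop_get(), using
only names from this diff, with the surrounding caller context assumed:

    nvlist_t *props, *propval;
    uint64_t freeing;

    VERIFY(spa_prop_get(spa, &props) == 0);
    VERIFY(nvlist_lookup_nvlist(props,
        zpool_prop_to_name(ZPOOL_PROP_FREEING), &propval) == 0);
    VERIFY(nvlist_lookup_uint64(propval, ZPROP_VALUE, &freeing) == 0);
    nvlist_free(props);
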
  
 346  365  /*
 347  366   * Validate the given pool properties nvlist and modify the list
 348  367   * for the property values to be set.
 349  368   */
 350  369  static int
 351  370  spa_prop_validate(spa_t *spa, nvlist_t *props)
 352  371  {
 353  372          nvpair_t *elem;
 354  373          int error = 0, reset_bootfs = 0;
 355  374          uint64_t objnum;
      375 +        boolean_t has_feature = B_FALSE;
 356  376  
 357  377          elem = NULL;
 358  378          while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 359      -                zpool_prop_t prop;
 360      -                char *propname, *strval;
 361  379                  uint64_t intval;
 362      -                objset_t *os;
 363      -                char *slash, *check;
      380 +                char *strval, *slash, *check, *fname;
      381 +                const char *propname = nvpair_name(elem);
      382 +                zpool_prop_t prop = zpool_name_to_prop(propname);
 364  383  
 365      -                propname = nvpair_name(elem);
      384 +                switch (prop) {
      385 +                case ZPROP_INVAL:
      386 +                        if (!zpool_prop_feature(propname)) {
      387 +                                error = EINVAL;
      388 +                                break;
      389 +                        }
 366  390  
 367      -                if ((prop = zpool_name_to_prop(propname)) == ZPROP_INVAL)
 368      -                        return (EINVAL);
      391 +                        /*
      392 +                         * Sanitize the input.
      393 +                         */
      394 +                        if (nvpair_type(elem) != DATA_TYPE_UINT64) {
      395 +                                error = EINVAL;
      396 +                                break;
      397 +                        }
 369  398  
 370      -                switch (prop) {
      399 +                        if (nvpair_value_uint64(elem, &intval) != 0) {
      400 +                                error = EINVAL;
      401 +                                break;
      402 +                        }
      403 +
      404 +                        if (intval != 0) {
      405 +                                error = EINVAL;
      406 +                                break;
      407 +                        }
      408 +
      409 +                        fname = strchr(propname, '@') + 1;
      410 +                        if (zfeature_lookup_name(fname, NULL) != 0) {
      411 +                                error = EINVAL;
      412 +                                break;
      413 +                        }
      414 +
      415 +                        has_feature = B_TRUE;
      416 +                        break;
      417 +
 371  418                  case ZPOOL_PROP_VERSION:
 372  419                          error = nvpair_value_uint64(elem, &intval);
 373  420                          if (!error &&
 374      -                            (intval < spa_version(spa) || intval > SPA_VERSION))
      421 +                            (intval < spa_version(spa) ||
      422 +                            intval > SPA_VERSION_BEFORE_FEATURES ||
      423 +                            has_feature))
 375  424                                  error = EINVAL;
 376  425                          break;
 377  426  
 378  427                  case ZPOOL_PROP_DELEGATION:
 379  428                  case ZPOOL_PROP_AUTOREPLACE:
 380  429                  case ZPOOL_PROP_LISTSNAPS:
 381  430                  case ZPOOL_PROP_AUTOEXPAND:
 382  431                          error = nvpair_value_uint64(elem, &intval);
 383  432                          if (!error && intval > 1)
 384  433                                  error = EINVAL;
 385  434                          break;
 386  435  
 387  436                  case ZPOOL_PROP_BOOTFS:
 388  437                          /*
 389  438                           * If the pool version is less than SPA_VERSION_BOOTFS,
 390  439                           * or the pool is still being created (version == 0),
 391  440                           * the bootfs property cannot be set.
 392  441                           */
 393  442                          if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 394  443                                  error = ENOTSUP;
 395  444                                  break;
 396  445                          }
 397  446  
 398  447                          /*
 399  448                           * Make sure the vdev config is bootable
 400  449                           */
  
 401  450                          if (!vdev_is_bootable(spa->spa_root_vdev)) {
 402  451                                  error = ENOTSUP;
 403  452                                  break;
 404  453                          }
 405  454  
 406  455                          reset_bootfs = 1;
 407  456  
 408  457                          error = nvpair_value_string(elem, &strval);
 409  458  
 410  459                          if (!error) {
      460 +                                objset_t *os;
 411  461                                  uint64_t compress;
 412  462  
 413  463                                  if (strval == NULL || strval[0] == '\0') {
 414  464                                          objnum = zpool_prop_default_numeric(
 415  465                                              ZPOOL_PROP_BOOTFS);
 416  466                                          break;
 417  467                                  }
 418  468  
 419  469                                  if (error = dmu_objset_hold(strval, FTAG, &os))
 420  470                                          break;
 421  471  
 422  472                                  /* Must be ZPL and not gzip compressed. */
 423  473  
 424  474                                  if (dmu_objset_type(os) != DMU_OST_ZFS) {
 425  475                                          error = ENOTSUP;
 426  476                                  } else if ((error = dsl_prop_get_integer(strval,
 427  477                                      zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 428  478                                      &compress, NULL)) == 0 &&
 429  479                                      !BOOTFS_COMPRESS_VALID(compress)) {
 430  480                                          error = ENOTSUP;
 431  481                                  } else {
 432  482                                          objnum = dmu_objset_id(os);
 433  483                                  }
 434  484                                  dmu_objset_rele(os, FTAG);
 435  485                          }
 436  486                          break;
 437  487  
 438  488                  case ZPOOL_PROP_FAILUREMODE:
 439  489                          error = nvpair_value_uint64(elem, &intval);
 440  490                          if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
 441  491                              intval > ZIO_FAILURE_MODE_PANIC))
 442  492                                  error = EINVAL;
 443  493  
 444  494                          /*
 445  495                           * This is a special case which only occurs when
 446  496                           * the pool has completely failed. This allows
 447  497                           * the user to change the in-core failmode property
 448  498                           * without syncing it out to disk (I/Os might
 449  499                           * currently be blocked). We do this by returning
 450  500                           * EIO to the caller (spa_prop_set) to trick it
 451  501                           * into thinking we encountered a property validation
 452  502                           * error.
 453  503                           */
 454  504                          if (!error && spa_suspended(spa)) {
 455  505                                  spa->spa_failmode = intval;
 456  506                                  error = EIO;
 457  507                          }
 458  508                          break;
 459  509  
 460  510                  case ZPOOL_PROP_CACHEFILE:
 461  511                          if ((error = nvpair_value_string(elem, &strval)) != 0)
 462  512                                  break;
 463  513  
 464  514                          if (strval[0] == '\0')
 465  515                                  break;
 466  516  
 467  517                          if (strcmp(strval, "none") == 0)
 468  518                                  break;
 469  519  
 470  520                          if (strval[0] != '/') {
 471  521                                  error = EINVAL;
 472  522                                  break;
 473  523                          }
 474  524  
 475  525                          slash = strrchr(strval, '/');
 476  526                          ASSERT(slash != NULL);
 477  527  
 478  528                          if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 479  529                              strcmp(slash, "/..") == 0)
 480  530                                  error = EINVAL;
 481  531                          break;
 482  532  
 483  533                  case ZPOOL_PROP_COMMENT:
 484  534                          if ((error = nvpair_value_string(elem, &strval)) != 0)
 485  535                                  break;
 486  536                          for (check = strval; *check != '\0'; check++) {
 487  537                                  /*
 488  538                                   * The kernel doesn't have an easy isprint()
 489  539                                   * check.  For this kernel check, we merely
 490  540                                   * check ASCII apart from DEL.  Fix this if
 491  541                                   * there is an easy-to-use kernel isprint().
 492  542                                   */
 493  543                                  if (*check >= 0x7f) {
 494  544                                          error = EINVAL;
 495  545                                          break;
 496  546                                  }
 498  548                          }
 499  549                          if (strlen(strval) > ZPROP_MAX_COMMENT)
 500  550                                  error = E2BIG;
 501  551                          break;
 502  552  
 503  553                  case ZPOOL_PROP_DEDUPDITTO:
 504  554                          if (spa_version(spa) < SPA_VERSION_DEDUP)
 505  555                                  error = ENOTSUP;
 506  556                          else
 507  557                                  error = nvpair_value_uint64(elem, &intval);
 508  558                          if (error == 0 &&
 509  559                              intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
 510  560                                  error = EINVAL;
 511  561                          break;
 512  562                  }
 513  563  
 514  564                  if (error)
 515  565                          break;
 516  566          }
 517  567  
 518  568          if (!error && reset_bootfs) {
 519  569                  error = nvlist_remove(props,
 520  570                      zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 521  571  
 522  572                  if (!error) {
 523  573                          error = nvlist_add_uint64(props,
 524  574                              zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 525  575                  }
 526  576          }
 527  577  
 528  578          return (error);
 529  579  }
 530  580  
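Review note on the version/feature interplay validated above: the legacy
version property may only be raised as far as SPA_VERSION_BEFORE_FEATURES,
and a request that also carries a feature@ pair is rejected once has_feature
is set. A sketch, assuming a pool at or below version 28 (callable only
within this file, since spa_prop_validate() is static):

    nvlist_t *props;
    int error;

    /*
     * A plain legacy bump to SPA_VERSION_BEFORE_FEATURES passes;
     * asking for anything beyond it, or combining it with a
     * feature@ pair seen earlier in the list, fails with EINVAL.
     */
    VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_uint64(props,
        zpool_prop_to_name(ZPOOL_PROP_VERSION),
        SPA_VERSION_BEFORE_FEATURES) == 0);
    error = spa_prop_validate(spa, props);      /* expect 0 here */
    nvlist_free(props);
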
 531  581  void
 532  582  spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 533  583  {
 534  584          char *cachefile;
 535  585          spa_config_dirent_t *dp;
 536  586  
 537  587          if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
 538  588              &cachefile) != 0)
 539  589                  return;
 540  590  
 541  591          dp = kmem_alloc(sizeof (spa_config_dirent_t),
 542  592              KM_SLEEP);
 543  593  
 544  594          if (cachefile[0] == '\0')
 545  595                  dp->scd_path = spa_strdup(spa_config_path);
 546  596          else if (strcmp(cachefile, "none") == 0)
 547  597                  dp->scd_path = NULL;
 548  598          else
 549  599                  dp->scd_path = spa_strdup(cachefile);
  
 550  600  
 551  601          list_insert_head(&spa->spa_config_list, dp);
 552  602          if (need_sync)
 553  603                  spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 554  604  }
 555  605  
 556  606  int
 557  607  spa_prop_set(spa_t *spa, nvlist_t *nvp)
 558  608  {
 559  609          int error;
 560      -        nvpair_t *elem;
      610 +        nvpair_t *elem = NULL;
 561  611          boolean_t need_sync = B_FALSE;
 562      -        zpool_prop_t prop;
 563  612  
 564  613          if ((error = spa_prop_validate(spa, nvp)) != 0)
 565  614                  return (error);
 566  615  
 567      -        elem = NULL;
 568  616          while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 569      -                if ((prop = zpool_name_to_prop(
 570      -                    nvpair_name(elem))) == ZPROP_INVAL)
 571      -                        return (EINVAL);
      617 +                zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 572  618  
 573  619                  if (prop == ZPOOL_PROP_CACHEFILE ||
 574  620                      prop == ZPOOL_PROP_ALTROOT ||
 575  621                      prop == ZPOOL_PROP_READONLY)
 576  622                          continue;
 577  623  
      624 +                if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) {
      625 +                        uint64_t ver;
      626 +
      627 +                        if (prop == ZPOOL_PROP_VERSION) {
      628 +                                VERIFY(nvpair_value_uint64(elem, &ver) == 0);
      629 +                        } else {
      630 +                                ASSERT(zpool_prop_feature(nvpair_name(elem)));
      631 +                                ver = SPA_VERSION_FEATURES;
      632 +                                need_sync = B_TRUE;
      633 +                        }
      634 +
      635 +                        /* Save time if the version is already set. */
      636 +                        if (ver == spa_version(spa))
      637 +                                continue;
      638 +
      639 +                        /*
      640 +                         * In addition to the pool directory object, we might
      641 +                         * create the pool properties object, the features for
      642 +                         * read object, the features for write object, or the
      643 +                         * feature descriptions object.
      644 +                         */
      645 +                        error = dsl_sync_task_do(spa_get_dsl(spa), NULL,
      646 +                            spa_sync_version, spa, &ver, 6);
      647 +                        if (error)
      648 +                                return (error);
      649 +                        continue;
      650 +                }
      651 +
 578  652                  need_sync = B_TRUE;
 579  653                  break;
 580  654          }
 581  655  
 582      -        if (need_sync)
      656 +        if (need_sync) {
 583  657                  return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props,
 584      -                    spa, nvp, 3));
 585      -        else
 586      -                return (0);
      658 +                    spa, nvp, 6));
      659 +        }
      660 +
      661 +        return (0);
 587  662  }
 588  663  
 589  664  /*
 590  665   * If the bootfs property value is dsobj, clear it.
 591  666   */
 592  667  void
 593  668  spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 594  669  {
 595  670          if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) {
 596  671                  VERIFY(zap_remove(spa->spa_meta_objset,
 597  672                      spa->spa_pool_props_object,
 598  673                      zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 599  674                  spa->spa_bootfs = 0;
 600  675          }
 601  676  }
 602  677  
 603  678  /*
 604  679   * Change the GUID for the pool.  This is done so that we can later
 605  680   * re-import a pool built from a clone of our own vdevs.  We will modify
 606  681   * the root vdev's guid, our own pool guid, and then mark all of our
 607  682   * vdevs dirty.  Note that we must make sure that all our vdevs are
 608  683   * online when we do this, or else any vdevs that weren't present
 609  684   * would be orphaned from our pool.  We are also going to issue a
 610  685   * sysevent to update any watchers.
 611  686   */
 612  687  int
 613  688  spa_change_guid(spa_t *spa)
 614  689  {
 615  690          uint64_t        oldguid, newguid;
 616  691          uint64_t        txg;
 617  692  
 618  693          if (!(spa_mode_global & FWRITE))
 619  694                  return (EROFS);
 620  695  
 621  696          txg = spa_vdev_enter(spa);
 622  697  
 623  698          if (spa->spa_root_vdev->vdev_state != VDEV_STATE_HEALTHY)
 624  699                  return (spa_vdev_exit(spa, NULL, txg, ENXIO));
 625  700  
 626  701          oldguid = spa_guid(spa);
 627  702          newguid = spa_generate_guid(NULL);
 628  703          ASSERT3U(oldguid, !=, newguid);
 629  704  
 630  705          spa->spa_root_vdev->vdev_guid = newguid;
 631  706          spa->spa_root_vdev->vdev_guid_sum += (newguid - oldguid);
 632  707  
 633  708          vdev_config_dirty(spa->spa_root_vdev);
 634  709  
 635  710          spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
 636  711  
 637  712          return (spa_vdev_exit(spa, NULL, txg, 0));
 638  713  }
 639  714  
 640  715  /*
 641  716   * ==========================================================================
 642  717   * SPA state manipulation (open/create/destroy/import/export)
 643  718   * ==========================================================================
 644  719   */
 645  720  
 646  721  static int
 647  722  spa_error_entry_compare(const void *a, const void *b)
 648  723  {
 649  724          spa_error_entry_t *sa = (spa_error_entry_t *)a;
 650  725          spa_error_entry_t *sb = (spa_error_entry_t *)b;
 651  726          int ret;
 652  727  
 653  728          ret = bcmp(&sa->se_bookmark, &sb->se_bookmark,
 654  729              sizeof (zbookmark_t));
 655  730  
 656  731          if (ret < 0)
 657  732                  return (-1);
 658  733          else if (ret > 0)
 659  734                  return (1);
 660  735          else
 661  736                  return (0);
 662  737  }
 663  738  
 664  739  /*
 665  740   * Utility function which retrieves copies of the current logs and
 666  741   * re-initializes them in the process.
 667  742   */
 668  743  void
 669  744  spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 670  745  {
 671  746          ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 672  747  
 673  748          bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
 674  749          bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
 675  750  
 676  751          avl_create(&spa->spa_errlist_scrub,
 677  752              spa_error_entry_compare, sizeof (spa_error_entry_t),
 678  753              offsetof(spa_error_entry_t, se_avl));
 679  754          avl_create(&spa->spa_errlist_last,
 680  755              spa_error_entry_compare, sizeof (spa_error_entry_t),
 681  756              offsetof(spa_error_entry_t, se_avl));
 682  757  }
 683  758  
 684  759  static taskq_t *
 685  760  spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode,
 686  761      uint_t value)
 687  762  {
 688  763          uint_t flags = 0;
 689  764          boolean_t batch = B_FALSE;
 690  765  
 691  766          switch (mode) {
 692  767          case zti_mode_null:
 693  768                  return (NULL);          /* no taskq needed */
 694  769  
 695  770          case zti_mode_fixed:
 696  771                  ASSERT3U(value, >=, 1);
 697  772                  value = MAX(value, 1);
 698  773                  break;
 699  774  
 700  775          case zti_mode_batch:
 701  776                  batch = B_TRUE;
 702  777                  flags |= TASKQ_THREADS_CPU_PCT;
 703  778                  value = zio_taskq_batch_pct;
 704  779                  break;
 705  780  
 706  781          case zti_mode_online_percent:
 707  782                  flags |= TASKQ_THREADS_CPU_PCT;
 708  783                  break;
 709  784  
 710  785          default:
 711  786                  panic("unrecognized mode for %s taskq (%u:%u) in "
 712  787                      "spa_activate()",
 713  788                      name, mode, value);
 714  789                  break;
 715  790          }
 716  791  
 717  792          if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 718  793                  if (batch)
 719  794                          flags |= TASKQ_DC_BATCH;
 720  795  
 721  796                  return (taskq_create_sysdc(name, value, 50, INT_MAX,
 722  797                      spa->spa_proc, zio_taskq_basedc, flags));
 723  798          }
 724  799          return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX,
 725  800              spa->spa_proc, flags));
 726  801  }
 727  802  
 728  803  static void
 729  804  spa_create_zio_taskqs(spa_t *spa)
 730  805  {
 731  806          for (int t = 0; t < ZIO_TYPES; t++) {
 732  807                  for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 733  808                          const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 734  809                          enum zti_modes mode = ztip->zti_mode;
 735  810                          uint_t value = ztip->zti_value;
 736  811                          char name[32];
 737  812  
 738  813                          (void) snprintf(name, sizeof (name),
 739  814                              "%s_%s", zio_type_name[t], zio_taskq_types[q]);
 740  815  
 741  816                          spa->spa_zio_taskq[t][q] =
 742  817                              spa_taskq_create(spa, name, mode, value);
 743  818                  }
 744  819          }
 745  820  }
 746  821  
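Review note, a worked example for the tables above: ZIO_TYPE_WRITE's ISSUE
queue is ZTI_BATCH, so in the non-sysdc case spa_taskq_create() reduces to
roughly the following (names from this file; zio_type_name[] supplies the
"zio_write" half of the taskq name):

    taskq_t *tq;

    /* mode == zti_mode_batch: percent-of-online-CPUs taskq. */
    uint_t flags = TASKQ_THREADS_CPU_PCT;
    uint_t value = zio_taskq_batch_pct;         /* default 100 */

    tq = taskq_create_proc("zio_write_issue", value, maxclsyspri,
        50, INT_MAX, spa->spa_proc, flags);
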
 747  822  #ifdef _KERNEL
 748  823  static void
 749  824  spa_thread(void *arg)
 750  825  {
 751  826          callb_cpr_t cprinfo;
 752  827  
 753  828          spa_t *spa = arg;
 754  829          user_t *pu = PTOU(curproc);
 755  830  
 756  831          CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
 757  832              spa->spa_name);
 758  833  
 759  834          ASSERT(curproc != &p0);
 760  835          (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
 761  836              "zpool-%s", spa->spa_name);
 762  837          (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
 763  838  
 764  839          /* bind this thread to the requested psrset */
 765  840          if (zio_taskq_psrset_bind != PS_NONE) {
 766  841                  pool_lock();
 767  842                  mutex_enter(&cpu_lock);
 768  843                  mutex_enter(&pidlock);
 769  844                  mutex_enter(&curproc->p_lock);
 770  845  
 771  846                  if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
 772  847                      0, NULL, NULL) == 0)  {
 773  848                          curthread->t_bind_pset = zio_taskq_psrset_bind;
 774  849                  } else {
 775  850                          cmn_err(CE_WARN,
 776  851                              "Couldn't bind process for zfs pool \"%s\" to "
 777  852                              "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
 778  853                  }
 779  854  
 780  855                  mutex_exit(&curproc->p_lock);
 781  856                  mutex_exit(&pidlock);
 782  857                  mutex_exit(&cpu_lock);
 783  858                  pool_unlock();
 784  859          }
 785  860  
 786  861          if (zio_taskq_sysdc) {
 787  862                  sysdc_thread_enter(curthread, 100, 0);
 788  863          }
 789  864  
 790  865          spa->spa_proc = curproc;
 791  866          spa->spa_did = curthread->t_did;
 792  867  
 793  868          spa_create_zio_taskqs(spa);
 794  869  
 795  870          mutex_enter(&spa->spa_proc_lock);
 796  871          ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
 797  872  
 798  873          spa->spa_proc_state = SPA_PROC_ACTIVE;
 799  874          cv_broadcast(&spa->spa_proc_cv);
 800  875  
 801  876          CALLB_CPR_SAFE_BEGIN(&cprinfo);
 802  877          while (spa->spa_proc_state == SPA_PROC_ACTIVE)
 803  878                  cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 804  879          CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
 805  880  
 806  881          ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
 807  882          spa->spa_proc_state = SPA_PROC_GONE;
 808  883          spa->spa_proc = &p0;
 809  884          cv_broadcast(&spa->spa_proc_cv);
 810  885          CALLB_CPR_EXIT(&cprinfo);       /* drops spa_proc_lock */
 811  886  
 812  887          mutex_enter(&curproc->p_lock);
 813  888          lwp_exit();
 814  889  }
 815  890  #endif
 816  891  
 817  892  /*
 818  893   * Activate an uninitialized pool.
 819  894   */
 820  895  static void
 821  896  spa_activate(spa_t *spa, int mode)
 822  897  {
 823  898          ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 824  899  
 825  900          spa->spa_state = POOL_STATE_ACTIVE;
 826  901          spa->spa_mode = mode;
 827  902  
 828  903          spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
 829  904          spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
 830  905  
 831  906          /* Try to create a covering process */
 832  907          mutex_enter(&spa->spa_proc_lock);
 833  908          ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
 834  909          ASSERT(spa->spa_proc == &p0);
 835  910          spa->spa_did = 0;
 836  911  
 837  912          /* Only create a process if we're going to be around a while. */
 838  913          if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
 839  914                  if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
 840  915                      NULL, 0) == 0) {
 841  916                          spa->spa_proc_state = SPA_PROC_CREATED;
 842  917                          while (spa->spa_proc_state == SPA_PROC_CREATED) {
 843  918                                  cv_wait(&spa->spa_proc_cv,
 844  919                                      &spa->spa_proc_lock);
 845  920                          }
 846  921                          ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 847  922                          ASSERT(spa->spa_proc != &p0);
 848  923                          ASSERT(spa->spa_did != 0);
 849  924                  } else {
 850  925  #ifdef _KERNEL
 851  926                          cmn_err(CE_WARN,
 852  927                              "Couldn't create process for zfs pool \"%s\"\n",
 853  928                              spa->spa_name);
 854  929  #endif
 855  930                  }
 856  931          }
 857  932          mutex_exit(&spa->spa_proc_lock);
 858  933  
 859  934          /* If we didn't create a process, we need to create our taskqs. */
 860  935          if (spa->spa_proc == &p0) {
 861  936                  spa_create_zio_taskqs(spa);
 862  937          }
 863  938  
 864  939          list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 865  940              offsetof(vdev_t, vdev_config_dirty_node));
 866  941          list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 867  942              offsetof(vdev_t, vdev_state_dirty_node));
 868  943  
 869  944          txg_list_create(&spa->spa_vdev_txg_list,
 870  945              offsetof(struct vdev, vdev_txg_node));
 871  946  
 872  947          avl_create(&spa->spa_errlist_scrub,
 873  948              spa_error_entry_compare, sizeof (spa_error_entry_t),
 874  949              offsetof(spa_error_entry_t, se_avl));
 875  950          avl_create(&spa->spa_errlist_last,
 876  951              spa_error_entry_compare, sizeof (spa_error_entry_t),
 877  952              offsetof(spa_error_entry_t, se_avl));
 878  953  }
 879  954  
 880  955  /*
 881  956   * Opposite of spa_activate().
 882  957   */
 883  958  static void
 884  959  spa_deactivate(spa_t *spa)
 885  960  {
 886  961          ASSERT(spa->spa_sync_on == B_FALSE);
 887  962          ASSERT(spa->spa_dsl_pool == NULL);
 888  963          ASSERT(spa->spa_root_vdev == NULL);
 889  964          ASSERT(spa->spa_async_zio_root == NULL);
 890  965          ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 891  966  
 892  967          txg_list_destroy(&spa->spa_vdev_txg_list);
 893  968  
 894  969          list_destroy(&spa->spa_config_dirty_list);
 895  970          list_destroy(&spa->spa_state_dirty_list);
 896  971  
 897  972          for (int t = 0; t < ZIO_TYPES; t++) {
 898  973                  for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 899  974                          if (spa->spa_zio_taskq[t][q] != NULL)
 900  975                                  taskq_destroy(spa->spa_zio_taskq[t][q]);
 901  976                          spa->spa_zio_taskq[t][q] = NULL;
 902  977                  }
 903  978          }
 904  979  
 905  980          metaslab_class_destroy(spa->spa_normal_class);
 906  981          spa->spa_normal_class = NULL;
 907  982  
 908  983          metaslab_class_destroy(spa->spa_log_class);
 909  984          spa->spa_log_class = NULL;
 910  985  
 911  986          /*
 912  987           * If this was part of an import or the open otherwise failed, we may
 913  988           * still have errors left in the queues.  Empty them just in case.
 914  989           */
 915  990          spa_errlog_drain(spa);
 916  991  
 917  992          avl_destroy(&spa->spa_errlist_scrub);
 918  993          avl_destroy(&spa->spa_errlist_last);
 919  994  
 920  995          spa->spa_state = POOL_STATE_UNINITIALIZED;
 921  996  
 922  997          mutex_enter(&spa->spa_proc_lock);
 923  998          if (spa->spa_proc_state != SPA_PROC_NONE) {
 924  999                  ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 925 1000                  spa->spa_proc_state = SPA_PROC_DEACTIVATE;
 926 1001                  cv_broadcast(&spa->spa_proc_cv);
 927 1002                  while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
 928 1003                          ASSERT(spa->spa_proc != &p0);
 929 1004                          cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 930 1005                  }
 931 1006                  ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
 932 1007                  spa->spa_proc_state = SPA_PROC_NONE;
 933 1008          }
 934 1009          ASSERT(spa->spa_proc == &p0);
 935 1010          mutex_exit(&spa->spa_proc_lock);
 936 1011  
 937 1012          /*
 938 1013           * We want to make sure spa_thread() has actually exited the ZFS
 939 1014           * module, so that the module can't be unloaded out from underneath
 940 1015           * it.
 941 1016           */
 942 1017          if (spa->spa_did != 0) {
 943 1018                  thread_join(spa->spa_did);
 944 1019                  spa->spa_did = 0;
 945 1020          }
 946 1021  }
 947 1022  
 948 1023  /*
 949 1024   * Verify a pool configuration, and construct the vdev tree appropriately.  This
 950 1025   * will create all the necessary vdevs in the appropriate layout, with each vdev
 951 1026   * in the CLOSED state.  This will prep the pool before open/creation/import.
 952 1027   * All vdev validation is done by the vdev_alloc() routine.
 953 1028   */
 954 1029  static int
 955 1030  spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
 956 1031      uint_t id, int atype)
 957 1032  {
 958 1033          nvlist_t **child;
 959 1034          uint_t children;
 960 1035          int error;
 961 1036  
 962 1037          if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0)
 963 1038                  return (error);
 964 1039  
 965 1040          if ((*vdp)->vdev_ops->vdev_op_leaf)
 966 1041                  return (0);
 967 1042  
 968 1043          error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 969 1044              &child, &children);
 970 1045  
 971 1046          if (error == ENOENT)
 972 1047                  return (0);
 973 1048  
 974 1049          if (error) {
 975 1050                  vdev_free(*vdp);
 976 1051                  *vdp = NULL;
 977 1052                  return (EINVAL);
 978 1053          }
 979 1054  
 980 1055          for (int c = 0; c < children; c++) {
 981 1056                  vdev_t *vd;
 982 1057                  if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c,
 983 1058                      atype)) != 0) {
 984 1059                          vdev_free(*vdp);
 985 1060                          *vdp = NULL;
 986 1061                          return (error);
 987 1062                  }
 988 1063          }
 989 1064  
 990 1065          ASSERT(*vdp != NULL);
 991 1066  
 992 1067          return (0);
 993 1068  }
 994 1069  
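Review note on the recursion above: a leaf config carries no
ZPOOL_CONFIG_CHILDREN and returns through the vdev_op_leaf early exit. A
minimal sketch of such a leaf nvlist (the device path is illustrative only):

    nvlist_t *nv;

    VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
    VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) == 0);
    VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PATH,
        "/dev/dsk/c0t0d0s0") == 0);
    /* e.g. spa_config_parse(spa, &vd, nv, NULL, 0, VDEV_ALLOC_SPARE) */
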
 995 1070  /*
 996 1071   * Opposite of spa_load().
 997 1072   */
 998 1073  static void
 999 1074  spa_unload(spa_t *spa)
1000 1075  {
1001 1076          int i;
1002 1077  
1003 1078          ASSERT(MUTEX_HELD(&spa_namespace_lock));
1004 1079  
1005 1080          /*
1006 1081           * Stop async tasks.
1007 1082           */
1008 1083          spa_async_suspend(spa);
1009 1084  
1010 1085          /*
1011 1086           * Stop syncing.
1012 1087           */
1013 1088          if (spa->spa_sync_on) {
1014 1089                  txg_sync_stop(spa->spa_dsl_pool);
1015 1090                  spa->spa_sync_on = B_FALSE;
1016 1091          }
1017 1092  
1018 1093          /*
1019 1094           * Wait for any outstanding async I/O to complete.
1020 1095           */
1021 1096          if (spa->spa_async_zio_root != NULL) {
1022 1097                  (void) zio_wait(spa->spa_async_zio_root);
1023 1098                  spa->spa_async_zio_root = NULL;
1024 1099          }
1025 1100  
1026 1101          bpobj_close(&spa->spa_deferred_bpobj);
1027 1102  
1028 1103          /*
1029 1104           * Close the dsl pool.
1030 1105           */
1031 1106          if (spa->spa_dsl_pool) {
1032 1107                  dsl_pool_close(spa->spa_dsl_pool);
1033 1108                  spa->spa_dsl_pool = NULL;
1034 1109                  spa->spa_meta_objset = NULL;
1035 1110          }
1036 1111  
1037 1112          ddt_unload(spa);
1038 1113  
1039 1114          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1040 1115  
1041 1116          /*
1042 1117           * Drop and purge level 2 cache
1043 1118           */
1044 1119          spa_l2cache_drop(spa);
1045 1120  
1046 1121          /*
1047 1122           * Close all vdevs.
1048 1123           */
1049 1124          if (spa->spa_root_vdev)
1050 1125                  vdev_free(spa->spa_root_vdev);
1051 1126          ASSERT(spa->spa_root_vdev == NULL);
1052 1127  
1053 1128          for (i = 0; i < spa->spa_spares.sav_count; i++)
1054 1129                  vdev_free(spa->spa_spares.sav_vdevs[i]);
1055 1130          if (spa->spa_spares.sav_vdevs) {
1056 1131                  kmem_free(spa->spa_spares.sav_vdevs,
1057 1132                      spa->spa_spares.sav_count * sizeof (void *));
1058 1133                  spa->spa_spares.sav_vdevs = NULL;
1059 1134          }
1060 1135          if (spa->spa_spares.sav_config) {
1061 1136                  nvlist_free(spa->spa_spares.sav_config);
1062 1137                  spa->spa_spares.sav_config = NULL;
1063 1138          }
1064 1139          spa->spa_spares.sav_count = 0;
1065 1140  
1066 1141          for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
1067 1142                  vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
1068 1143                  vdev_free(spa->spa_l2cache.sav_vdevs[i]);
1069 1144          }
1070 1145          if (spa->spa_l2cache.sav_vdevs) {
1071 1146                  kmem_free(spa->spa_l2cache.sav_vdevs,
1072 1147                      spa->spa_l2cache.sav_count * sizeof (void *));
1073 1148                  spa->spa_l2cache.sav_vdevs = NULL;
1074 1149          }
1075 1150          if (spa->spa_l2cache.sav_config) {
1076 1151                  nvlist_free(spa->spa_l2cache.sav_config);
1077 1152                  spa->spa_l2cache.sav_config = NULL;
1078 1153          }
1079 1154          spa->spa_l2cache.sav_count = 0;
1080 1155  
1081 1156          spa->spa_async_suspended = 0;
1082 1157  
1083 1158          if (spa->spa_comment != NULL) {
1084 1159                  spa_strfree(spa->spa_comment);
1085 1160                  spa->spa_comment = NULL;
1086 1161          }
1087 1162  
1088 1163          spa_config_exit(spa, SCL_ALL, FTAG);
1089 1164  }
1090 1165  
1091 1166  /*
1092 1167   * Load (or re-load) the current list of vdevs describing the active spares for
1093 1168   * this pool.  When this is called, we have some form of basic information in
1094 1169   * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
1095 1170   * then re-generate a more complete list including status information.
1096 1171   */
1097 1172  static void
1098 1173  spa_load_spares(spa_t *spa)
1099 1174  {
1100 1175          nvlist_t **spares;
1101 1176          uint_t nspares;
1102 1177          int i;
1103 1178          vdev_t *vd, *tvd;
1104 1179  
1105 1180          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1106 1181  
1107 1182          /*
1108 1183           * First, close and free any existing spare vdevs.
1109 1184           */
1110 1185          for (i = 0; i < spa->spa_spares.sav_count; i++) {
1111 1186                  vd = spa->spa_spares.sav_vdevs[i];
1112 1187  
1113 1188                  /* Undo the call to spa_activate() below */
1114 1189                  if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1115 1190                      B_FALSE)) != NULL && tvd->vdev_isspare)
1116 1191                          spa_spare_remove(tvd);
1117 1192                  vdev_close(vd);
1118 1193                  vdev_free(vd);
1119 1194          }
1120 1195  
1121 1196          if (spa->spa_spares.sav_vdevs)
1122 1197                  kmem_free(spa->spa_spares.sav_vdevs,
1123 1198                      spa->spa_spares.sav_count * sizeof (void *));
1124 1199  
1125 1200          if (spa->spa_spares.sav_config == NULL)
1126 1201                  nspares = 0;
1127 1202          else
1128 1203                  VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
1129 1204                      ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
1130 1205  
1131 1206          spa->spa_spares.sav_count = (int)nspares;
1132 1207          spa->spa_spares.sav_vdevs = NULL;
1133 1208  
1134 1209          if (nspares == 0)
1135 1210                  return;
1136 1211  
1137 1212          /*
1138 1213           * Construct the array of vdevs, opening them to get status in the
1139 1214           * process.  For each spare, there are potentially two different vdev_t
1140 1215           * structures associated with it: one in the list of spares (used only
1141 1216           * for basic validation purposes) and one in the active vdev
1142 1217           * configuration (if it's spared in).  During this phase we open and
1143 1218           * validate each vdev on the spare list.  If the vdev also exists in the
1144 1219           * active configuration, then we also mark this vdev as an active spare.
1145 1220           */
1146 1221          spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
1147 1222              KM_SLEEP);
1148 1223          for (i = 0; i < spa->spa_spares.sav_count; i++) {
1149 1224                  VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
1150 1225                      VDEV_ALLOC_SPARE) == 0);
1151 1226                  ASSERT(vd != NULL);
1152 1227  
1153 1228                  spa->spa_spares.sav_vdevs[i] = vd;
1154 1229  
1155 1230                  if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
1156 1231                      B_FALSE)) != NULL) {
1157 1232                          if (!tvd->vdev_isspare)
1158 1233                                  spa_spare_add(tvd);
1159 1234  
1160 1235                          /*
1161 1236                           * We only mark the spare active if we were successfully
1162 1237                           * able to load the vdev.  Otherwise, importing a pool
1163 1238                           * with a bad active spare would result in strange
1164 1239                   * behavior, because multiple pools would think the spare
1165 1240                           * is actively in use.
1166 1241                           *
1167 1242                           * There is a vulnerability here to an equally bizarre
1168 1243                           * circumstance, where a dead active spare is later
1169 1244                           * brought back to life (onlined or otherwise).  Given
1170 1245                           * the rarity of this scenario, and the extra complexity
1171 1246                           * it adds, we ignore the possibility.
1172 1247                           */
1173 1248                          if (!vdev_is_dead(tvd))
1174 1249                                  spa_spare_activate(tvd);
1175 1250                  }
1176 1251  
1177 1252                  vd->vdev_top = vd;
1178 1253                  vd->vdev_aux = &spa->spa_spares;
1179 1254  
1180 1255                  if (vdev_open(vd) != 0)
1181 1256                          continue;
1182 1257  
1183 1258                  if (vdev_validate_aux(vd) == 0)
1184 1259                          spa_spare_add(vd);
1185 1260          }
1186 1261  
1187 1262          /*
1188 1263           * Recompute the stashed list of spares, with status information
1189 1264           * this time.
1190 1265           */
1191 1266          VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
1192 1267              DATA_TYPE_NVLIST_ARRAY) == 0);
1193 1268  
1194 1269          spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
1195 1270              KM_SLEEP);
1196 1271          for (i = 0; i < spa->spa_spares.sav_count; i++)
1197 1272                  spares[i] = vdev_config_generate(spa,
1198 1273                      spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
1199 1274          VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
1200 1275              ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
1201 1276          for (i = 0; i < spa->spa_spares.sav_count; i++)
1202 1277                  nvlist_free(spares[i]);
1203 1278          kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
1204 1279  }
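/*
 * A minimal sketch of reading the regenerated spare list back out of
 * sav_config; the lookups mirror the ones used in spa_load_spares()
 * above, and entries without a ZPOOL_CONFIG_PATH are skipped.
 */
static void
example_print_spares(spa_t *spa)
{
        nvlist_t **spares;
        uint_t nspares;

        if (nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
            ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
                return;

        for (uint_t i = 0; i < nspares; i++) {
                char *path;

                /* each entry is a full vdev config nvlist */
                if (nvlist_lookup_string(spares[i], ZPOOL_CONFIG_PATH,
                    &path) == 0)
                        cmn_err(CE_NOTE, "spare %u: %s", i, path);
        }
}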
1205 1280  
1206 1281  /*
1207 1282   * Load (or re-load) the current list of vdevs describing the active l2cache for
1208 1283   * this pool.  When this is called, we have some form of basic information in
1209 1284   * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
1210 1285   * then re-generate a more complete list including status information.
1211 1286   * Devices which are already active have their details maintained, and are
1212 1287   * not re-opened.
1213 1288   */
1214 1289  static void
1215 1290  spa_load_l2cache(spa_t *spa)
1216 1291  {
1217 1292          nvlist_t **l2cache;
1218 1293          uint_t nl2cache;
1219 1294          int i, j, oldnvdevs;
1220 1295          uint64_t guid;
1221 1296          vdev_t *vd, **oldvdevs, **newvdevs;
1222 1297          spa_aux_vdev_t *sav = &spa->spa_l2cache;
1223 1298  
1224 1299          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
1225 1300  
1226 1301          if (sav->sav_config != NULL) {
1227 1302                  VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
1228 1303                      ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
1229 1304                  newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
1230 1305          } else {
1231 1306                  nl2cache = 0;
1232 1307          }
1233 1308  
1234 1309          oldvdevs = sav->sav_vdevs;
1235 1310          oldnvdevs = sav->sav_count;
1236 1311          sav->sav_vdevs = NULL;
1237 1312          sav->sav_count = 0;
1238 1313  
1239 1314          /*
1240 1315           * Process new nvlist of vdevs.
1241 1316           */
1242 1317          for (i = 0; i < nl2cache; i++) {
1243 1318                  VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
1244 1319                      &guid) == 0);
1245 1320  
1246 1321                  newvdevs[i] = NULL;
1247 1322                  for (j = 0; j < oldnvdevs; j++) {
1248 1323                          vd = oldvdevs[j];
1249 1324                          if (vd != NULL && guid == vd->vdev_guid) {
1250 1325                                  /*
1251 1326                                   * Retain previous vdev for add/remove ops.
1252 1327                                   */
1253 1328                                  newvdevs[i] = vd;
1254 1329                                  oldvdevs[j] = NULL;
1255 1330                                  break;
1256 1331                          }
1257 1332                  }
1258 1333  
1259 1334                  if (newvdevs[i] == NULL) {
1260 1335                          /*
1261 1336                           * Create new vdev
1262 1337                           */
1263 1338                          VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
1264 1339                              VDEV_ALLOC_L2CACHE) == 0);
1265 1340                          ASSERT(vd != NULL);
1266 1341                          newvdevs[i] = vd;
1267 1342  
1268 1343                          /*
1269 1344                           * Commit this vdev as an l2cache device,
1270 1345                           * even if it fails to open.
1271 1346                           */
1272 1347                          spa_l2cache_add(vd);
1273 1348  
1274 1349                          vd->vdev_top = vd;
1275 1350                          vd->vdev_aux = sav;
1276 1351  
1277 1352                          spa_l2cache_activate(vd);
1278 1353  
1279 1354                          if (vdev_open(vd) != 0)
1280 1355                                  continue;
1281 1356  
1282 1357                          (void) vdev_validate_aux(vd);
1283 1358  
1284 1359                          if (!vdev_is_dead(vd))
1285 1360                                  l2arc_add_vdev(spa, vd);
1286 1361                  }
1287 1362          }
1288 1363  
1289 1364          /*
1290 1365           * Purge vdevs that were dropped
1291 1366           */
1292 1367          for (i = 0; i < oldnvdevs; i++) {
1293 1368                  uint64_t pool;
1294 1369  
1295 1370                  vd = oldvdevs[i];
1296 1371                  if (vd != NULL) {
1297 1372                          ASSERT(vd->vdev_isl2cache);
1298 1373  
1299 1374                          if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
1300 1375                              pool != 0ULL && l2arc_vdev_present(vd))
1301 1376                                  l2arc_remove_vdev(vd);
1302 1377                          vdev_clear_stats(vd);
1303 1378                          vdev_free(vd);
1304 1379                  }
1305 1380          }
1306 1381  
1307 1382          if (oldvdevs)
1308 1383                  kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
1309 1384  
1310 1385          if (sav->sav_config == NULL)
1311 1386                  goto out;
1312 1387  
1313 1388          sav->sav_vdevs = newvdevs;
1314 1389          sav->sav_count = (int)nl2cache;
1315 1390  
1316 1391          /*
1317 1392           * Recompute the stashed list of l2cache devices, with status
1318 1393           * information this time.
1319 1394           */
1320 1395          VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
1321 1396              DATA_TYPE_NVLIST_ARRAY) == 0);
1322 1397  
1323 1398          l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
1324 1399          for (i = 0; i < sav->sav_count; i++)
1325 1400                  l2cache[i] = vdev_config_generate(spa,
1326 1401                      sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
1327 1402          VERIFY(nvlist_add_nvlist_array(sav->sav_config,
1328 1403              ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
1329 1404  out:
1330 1405          for (i = 0; i < sav->sav_count; i++)
1331 1406                  nvlist_free(l2cache[i]);
1332 1407          if (sav->sav_count)
1333 1408                  kmem_free(l2cache, sav->sav_count * sizeof (void *));
1334 1409  }
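/*
 * The retain-or-create matching above, in isolation: each GUID from the
 * new list is looked up in the old array, and a hit is "stolen" (NULLed
 * out) so the purge pass will skip that slot.  A minimal sketch,
 * assuming the same array representation:
 */
static vdev_t *
example_steal_by_guid(vdev_t **oldvdevs, int oldnvdevs, uint64_t guid)
{
        for (int j = 0; j < oldnvdevs; j++) {
                vdev_t *vd = oldvdevs[j];

                if (vd != NULL && vd->vdev_guid == guid) {
                        oldvdevs[j] = NULL;     /* claimed; not purged */
                        return (vd);
                }
        }
        return (NULL);          /* caller parses a new vdev instead */
}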
1335 1410  
1336 1411  static int
1337 1412  load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
1338 1413  {
1339 1414          dmu_buf_t *db;
1340 1415          char *packed = NULL;
1341 1416          size_t nvsize = 0;
1342 1417          int error;
1343 1418          *value = NULL;
1344 1419  
1345 1420          VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
1346 1421          nvsize = *(uint64_t *)db->db_data;
1347 1422          dmu_buf_rele(db, FTAG);
1348 1423  
1349 1424          packed = kmem_alloc(nvsize, KM_SLEEP);
1350 1425          error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed,
1351 1426              DMU_READ_PREFETCH);
1352 1427          if (error == 0)
1353 1428                  error = nvlist_unpack(packed, nvsize, value, 0);
1354 1429          kmem_free(packed, nvsize);
1355 1430  
1356 1431          return (error);
1357 1432  }
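/*
 * load_nvlist() depends on the MOS layout for packed nvlists: the
 * object's bonus buffer holds the packed size as a uint64_t and the
 * object data holds the packed bytes.  Typical usage, mirroring how
 * the MOS copy of the config is read back later in this file (a
 * sketch; the caller owns and must free the result):
 */
static int
example_load_config(spa_t *spa, nvlist_t **configp)
{
        int error;

        error = load_nvlist(spa, spa->spa_config_object, configp);
        if (error != 0)
                return (error);

        /* ... use *configp, then nvlist_free(*configp) ... */
        return (0);
}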
1358 1433  
1359 1434  /*
1360 1435   * Checks to see if the given vdev could not be opened, in which case we post a
1361 1436   * sysevent to notify the autoreplace code that the device has been removed.
1362 1437   */
1363 1438  static void
1364 1439  spa_check_removed(vdev_t *vd)
1365 1440  {
1366 1441          for (int c = 0; c < vd->vdev_children; c++)
1367 1442                  spa_check_removed(vd->vdev_child[c]);
1368 1443  
1369 1444          if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) {
1370 1445                  zfs_post_autoreplace(vd->vdev_spa, vd);
1371 1446                  spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
1372 1447          }
1373 1448  }
1374 1449  
1375 1450  /*
1376 1451   * Validate the current config against the MOS config
1377 1452   */
1378 1453  static boolean_t
1379 1454  spa_config_valid(spa_t *spa, nvlist_t *config)
1380 1455  {
1381 1456          vdev_t *mrvd, *rvd = spa->spa_root_vdev;
1382 1457          nvlist_t *nv;
1383 1458  
1384 1459          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
1385 1460  
1386 1461          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1387 1462          VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
1388 1463  
1389 1464          ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
1390 1465  
1391 1466          /*
1392 1467           * If we're doing a normal import, then build up any additional
1393 1468           * diagnostic information about missing devices in this config.
1394 1469           * We'll pass this up to the user for further processing.
1395 1470           */
1396 1471          if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
1397 1472                  nvlist_t **child, *nv;
1398 1473                  uint64_t idx = 0;
1399 1474  
1400 1475                  child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **),
1401 1476                      KM_SLEEP);
1402 1477                  VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
1403 1478  
1404 1479                  for (int c = 0; c < rvd->vdev_children; c++) {
1405 1480                          vdev_t *tvd = rvd->vdev_child[c];
1406 1481                          vdev_t *mtvd  = mrvd->vdev_child[c];
1407 1482  
1408 1483                          if (tvd->vdev_ops == &vdev_missing_ops &&
1409 1484                              mtvd->vdev_ops != &vdev_missing_ops &&
1410 1485                              mtvd->vdev_islog)
1411 1486                                  child[idx++] = vdev_config_generate(spa, mtvd,
1412 1487                                      B_FALSE, 0);
1413 1488                  }
1414 1489  
1415 1490                  if (idx) {
1416 1491                          VERIFY(nvlist_add_nvlist_array(nv,
1417 1492                              ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
1418 1493                          VERIFY(nvlist_add_nvlist(spa->spa_load_info,
1419 1494                              ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0);
1420 1495  
1421 1496                          for (int i = 0; i < idx; i++)
1422 1497                                  nvlist_free(child[i]);
1423 1498                  }
1424 1499                  nvlist_free(nv);
1425 1500                  kmem_free(child, rvd->vdev_children * sizeof (nvlist_t **));
1426 1501          }
1427 1502  
1428 1503          /*
1429 1504           * Compare the root vdev tree with the information we have
1430 1505           * from the MOS config (mrvd). Check each top-level vdev
1431 1506           * with the corresponding MOS config top-level (mtvd).
1432 1507           */
1433 1508          for (int c = 0; c < rvd->vdev_children; c++) {
1434 1509                  vdev_t *tvd = rvd->vdev_child[c];
1435 1510                  vdev_t *mtvd  = mrvd->vdev_child[c];
1436 1511  
1437 1512                  /*
1438 1513                   * Resolve any "missing" vdevs in the current configuration.
1439 1514                   * If we find that the MOS config has more accurate information
1440 1515                   * about the top-level vdev, then use that vdev instead.
1441 1516                   */
1442 1517                  if (tvd->vdev_ops == &vdev_missing_ops &&
1443 1518                      mtvd->vdev_ops != &vdev_missing_ops) {
1444 1519  
1445 1520                          if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
1446 1521                                  continue;
1447 1522  
1448 1523                          /*
1449 1524                           * Device specific actions.
1450 1525                           */
1451 1526                          if (mtvd->vdev_islog) {
1452 1527                                  spa_set_log_state(spa, SPA_LOG_CLEAR);
1453 1528                          } else {
1454 1529                                  /*
1455 1530                                   * XXX - once we have 'readonly' pool
1456 1531                                   * support we should be able to handle
1457 1532                                   * missing data devices by transitioning
1458 1533                                   * the pool to readonly.
1459 1534                                   */
1460 1535                                  continue;
1461 1536                          }
1462 1537  
1463 1538                          /*
1464 1539                           * Swap the missing vdev with the data we were
1465 1540                           * able to obtain from the MOS config.
1466 1541                           */
1467 1542                          vdev_remove_child(rvd, tvd);
1468 1543                          vdev_remove_child(mrvd, mtvd);
1469 1544  
1470 1545                          vdev_add_child(rvd, mtvd);
1471 1546                          vdev_add_child(mrvd, tvd);
1472 1547  
1473 1548                          spa_config_exit(spa, SCL_ALL, FTAG);
1474 1549                          vdev_load(mtvd);
1475 1550                          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1476 1551  
1477 1552                          vdev_reopen(rvd);
1478 1553                  } else if (mtvd->vdev_islog) {
1479 1554                          /*
1480 1555                           * Load the slog device's state from the MOS config
1481 1556                           * since it's possible that the label does not
1482 1557                           * contain the most up-to-date information.
1483 1558                           */
1484 1559                          vdev_load_log_state(tvd, mtvd);
1485 1560                          vdev_reopen(tvd);
1486 1561                  }
1487 1562          }
1488 1563          vdev_free(mrvd);
1489 1564          spa_config_exit(spa, SCL_ALL, FTAG);
1490 1565  
1491 1566          /*
1492 1567           * Report whether we were able to validate the config.
1493 1568           */
1494 1569          return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
1495 1570  }
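/*
 * The return value above compares the tree's guid sum with the one
 * recorded in the uberblock.  Conceptually the sum is every vdev_guid
 * in the tree added together; a hedged recursive sketch (the real
 * value is maintained incrementally as vdev_guid_sum):
 */
static uint64_t
example_guid_sum(vdev_t *vd)
{
        uint64_t sum = vd->vdev_guid;

        for (int c = 0; c < vd->vdev_children; c++)
                sum += example_guid_sum(vd->vdev_child[c]);
        return (sum);
}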
1496 1571  
1497 1572  /*
1498 1573   * Check for missing log devices
1499 1574   */
1500 1575  static int
1501 1576  spa_check_logs(spa_t *spa)
1502 1577  {
1503 1578          switch (spa->spa_log_state) {
1504 1579          case SPA_LOG_MISSING:
1505 1580                  /* need to recheck in case slog has been restored */
1506 1581          case SPA_LOG_UNKNOWN:
1507 1582                  if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL,
1508 1583                      DS_FIND_CHILDREN)) {
1509 1584                          spa_set_log_state(spa, SPA_LOG_MISSING);
1510 1585                          return (1);
1511 1586                  }
1512 1587                  break;
1513 1588          }
1514 1589          return (0);
1515 1590  }
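/*
 * dmu_objset_find() drives a callback such as zil_check_log_chain over
 * every dataset under the pool name.  A minimal sketch of the callback
 * shape assumed here: it receives each dataset name plus the opaque
 * arg, and a nonzero return aborts the walk.
 */
/*ARGSUSED*/
static int
example_ds_cb(const char *dsname, void *arg)
{
        uint64_t *count = arg;

        (*count)++;             /* e.g. tally datasets visited */
        return (0);             /* nonzero stops the walk early */
}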
1516 1591  
1517 1592  static boolean_t
1518 1593  spa_passivate_log(spa_t *spa)
1519 1594  {
1520 1595          vdev_t *rvd = spa->spa_root_vdev;
1521 1596          boolean_t slog_found = B_FALSE;
1522 1597  
1523 1598          ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1524 1599  
1525 1600          if (!spa_has_slogs(spa))
1526 1601                  return (B_FALSE);
1527 1602  
1528 1603          for (int c = 0; c < rvd->vdev_children; c++) {
1529 1604                  vdev_t *tvd = rvd->vdev_child[c];
1530 1605                  metaslab_group_t *mg = tvd->vdev_mg;
1531 1606  
1532 1607                  if (tvd->vdev_islog) {
1533 1608                          metaslab_group_passivate(mg);
1534 1609                          slog_found = B_TRUE;
1535 1610                  }
1536 1611          }
1537 1612  
1538 1613          return (slog_found);
1539 1614  }
1540 1615  
1541 1616  static void
1542 1617  spa_activate_log(spa_t *spa)
1543 1618  {
1544 1619          vdev_t *rvd = spa->spa_root_vdev;
1545 1620  
1546 1621          ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
1547 1622  
1548 1623          for (int c = 0; c < rvd->vdev_children; c++) {
1549 1624                  vdev_t *tvd = rvd->vdev_child[c];
1550 1625                  metaslab_group_t *mg = tvd->vdev_mg;
1551 1626  
1552 1627                  if (tvd->vdev_islog)
1553 1628                          metaslab_group_activate(mg);
1554 1629          }
1555 1630  }
1556 1631  
1557 1632  int
1558 1633  spa_offline_log(spa_t *spa)
1559 1634  {
1560 1635          int error = 0;
1561 1636  
1562 1637          if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline,
1563 1638              NULL, DS_FIND_CHILDREN)) == 0) {
1564 1639  
1565 1640                  /*
1566 1641                   * We successfully offlined the log device, sync out the
1567 1642                   * current txg so that the "stubby" block can be removed
1568 1643                   * by zil_sync().
1569 1644                   */
1570 1645                  txg_wait_synced(spa->spa_dsl_pool, 0);
1571 1646          }
1572 1647          return (error);
1573 1648  }
1574 1649  
1575 1650  static void
1576 1651  spa_aux_check_removed(spa_aux_vdev_t *sav)
1577 1652  {
1578 1653          for (int i = 0; i < sav->sav_count; i++)
1579 1654                  spa_check_removed(sav->sav_vdevs[i]);
1580 1655  }
1581 1656  
1582 1657  void
1583 1658  spa_claim_notify(zio_t *zio)
1584 1659  {
1585 1660          spa_t *spa = zio->io_spa;
1586 1661  
1587 1662          if (zio->io_error)
1588 1663                  return;
1589 1664  
1590 1665          mutex_enter(&spa->spa_props_lock);      /* any mutex will do */
1591 1666          if (spa->spa_claim_max_txg < zio->io_bp->blk_birth)
1592 1667                  spa->spa_claim_max_txg = zio->io_bp->blk_birth;
1593 1668          mutex_exit(&spa->spa_props_lock);
1594 1669  }
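/*
 * The "any mutex will do" note works because spa_claim_max_txg is a
 * simple monotonic maximum: it only needs some lock to make the
 * compare-and-store atomic, not a lock with any particular meaning.
 * The same pattern with a dedicated lock, as a sketch:
 */
static void
example_update_max(kmutex_t *lock, uint64_t *maxp, uint64_t val)
{
        mutex_enter(lock);
        if (*maxp < val)
                *maxp = val;
        mutex_exit(lock);
}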
1595 1670  
1596 1671  typedef struct spa_load_error {
1597 1672          uint64_t        sle_meta_count;
1598 1673          uint64_t        sle_data_count;
1599 1674  } spa_load_error_t;
  
1600 1675  
1601 1676  static void
1602 1677  spa_load_verify_done(zio_t *zio)
1603 1678  {
1604 1679          blkptr_t *bp = zio->io_bp;
1605 1680          spa_load_error_t *sle = zio->io_private;
1606 1681          dmu_object_type_t type = BP_GET_TYPE(bp);
1607 1682          int error = zio->io_error;
1608 1683  
1609 1684          if (error) {
1610      -                if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) &&
     1685 +                if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
1611 1686                      type != DMU_OT_INTENT_LOG)
1612 1687                          atomic_add_64(&sle->sle_meta_count, 1);
1613 1688                  else
1614 1689                          atomic_add_64(&sle->sle_data_count, 1);
1615 1690          }
1616 1691          zio_data_buf_free(zio->io_data, zio->io_size);
1617 1692  }
1618 1693  
1619 1694  /*ARGSUSED*/
1620 1695  static int
1621 1696  spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
1622 1697      arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
1623 1698  {
1624 1699          if (bp != NULL) {
1625 1700                  zio_t *rio = arg;
1626 1701                  size_t size = BP_GET_PSIZE(bp);
1627 1702                  void *data = zio_data_buf_alloc(size);
1628 1703  
1629 1704                  zio_nowait(zio_read(rio, spa, bp, data, size,
1630 1705                      spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
1631 1706                      ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
1632 1707                      ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
1633 1708          }
1634 1709          return (0);
1635 1710  }
1636 1711  
1637 1712  static int
1638 1713  spa_load_verify(spa_t *spa)
1639 1714  {
1640 1715          zio_t *rio;
1641 1716          spa_load_error_t sle = { 0 };
1642 1717          zpool_rewind_policy_t policy;
1643 1718          boolean_t verify_ok = B_FALSE;
1644 1719          int error;
1645 1720  
1646 1721          zpool_get_rewind_policy(spa->spa_config, &policy);
1647 1722  
1648 1723          if (policy.zrp_request & ZPOOL_NEVER_REWIND)
1649 1724                  return (0);
1650 1725  
1651 1726          rio = zio_root(spa, NULL, &sle,
1652 1727              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
1653 1728  
1654 1729          error = traverse_pool(spa, spa->spa_verify_min_txg,
1655 1730              TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio);
1656 1731  
1657 1732          (void) zio_wait(rio);
1658 1733  
1659 1734          spa->spa_load_meta_errors = sle.sle_meta_count;
1660 1735          spa->spa_load_data_errors = sle.sle_data_count;
1661 1736  
1662 1737          if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
1663 1738              sle.sle_data_count <= policy.zrp_maxdata) {
1664 1739                  int64_t loss = 0;
1665 1740  
1666 1741                  verify_ok = B_TRUE;
1667 1742                  spa->spa_load_txg = spa->spa_uberblock.ub_txg;
1668 1743                  spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
1669 1744  
1670 1745                  loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
1671 1746                  VERIFY(nvlist_add_uint64(spa->spa_load_info,
1672 1747                      ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
1673 1748                  VERIFY(nvlist_add_int64(spa->spa_load_info,
1674 1749                      ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
1675 1750                  VERIFY(nvlist_add_uint64(spa->spa_load_info,
1676 1751                      ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
1677 1752          } else {
1678 1753                  spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
1679 1754          }
1680 1755  
1681 1756          if (error) {
1682 1757                  if (error != ENXIO && error != EIO)
1683 1758                          error = EIO;
1684 1759                  return (error);
1685 1760          }
1686 1761  
1687 1762          return (verify_ok ? 0 : EIO);
1688 1763  }
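/*
 * How the rewind policy thresholds gate the verify result, reduced to
 * the essentials; a sketch only (a zrp_maxmeta of 0, for example,
 * means no metadata errors are tolerated):
 */
static boolean_t
example_verify_ok(const spa_load_error_t *sle,
    const zpool_rewind_policy_t *zrp)
{
        return (sle->sle_meta_count <= zrp->zrp_maxmeta &&
            sle->sle_data_count <= zrp->zrp_maxdata);
}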
1689 1764  
1690 1765  /*
1691 1766   * Find a value in the pool props object.
1692 1767   */
1693 1768  static void
1694 1769  spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
1695 1770  {
1696 1771          (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object,
1697 1772              zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
1698 1773  }
1699 1774  
1700 1775  /*
1701 1776   * Find a value in the pool directory object.
1702 1777   */
1703 1778  static int
1704 1779  spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
1705 1780  {
1706 1781          return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
1707 1782              name, sizeof (uint64_t), 1, val));
1708 1783  }
1709 1784  
1710 1785  static int
1711 1786  spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
1712 1787  {
1713 1788          vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
1714 1789          return (err);
1715 1790  }
1716 1791  
1717 1792  /*
1718 1793   * Fix up config after a partly-completed split.  This is done with the
1719 1794   * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
1720 1795   * pool have that entry in their config, but only the splitting one contains
1721 1796   * a list of all the guids of the vdevs that are being split off.
1722 1797   *
1723 1798   * This function determines what to do with that list: either rejoin
1724 1799   * all the disks to the pool, or complete the splitting process.  To attempt
1725 1800   * the rejoin, each disk that is offlined is marked online again, and
1726 1801   * we do a reopen() call.  If the vdev label for every disk that was
1727 1802   * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL)
1728 1803   * then we call vdev_split() on each disk, and complete the split.
1729 1804   *
1730 1805   * Otherwise we leave the config alone, with all the vdevs in place in
1731 1806   * the original pool.
1732 1807   */
1733 1808  static void
1734 1809  spa_try_repair(spa_t *spa, nvlist_t *config)
1735 1810  {
1736 1811          uint_t extracted;
1737 1812          uint64_t *glist;
1738 1813          uint_t i, gcount;
1739 1814          nvlist_t *nvl;
1740 1815          vdev_t **vd;
1741 1816          boolean_t attempt_reopen;
1742 1817  
1743 1818          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0)
1744 1819                  return;
1745 1820  
1746 1821          /* check that the config is complete */
1747 1822          if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
1748 1823              &glist, &gcount) != 0)
1749 1824                  return;
1750 1825  
1751 1826          vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
1752 1827  
1753 1828          /* attempt to online all the vdevs & validate */
1754 1829          attempt_reopen = B_TRUE;
1755 1830          for (i = 0; i < gcount; i++) {
1756 1831                  if (glist[i] == 0)      /* vdev is hole */
1757 1832                          continue;
1758 1833  
1759 1834                  vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
1760 1835                  if (vd[i] == NULL) {
1761 1836                          /*
1762 1837                           * Don't bother attempting to reopen the disks;
1763 1838                           * just do the split.
1764 1839                           */
1765 1840                          attempt_reopen = B_FALSE;
1766 1841                  } else {
1767 1842                          /* attempt to re-online it */
1768 1843                          vd[i]->vdev_offline = B_FALSE;
1769 1844                  }
1770 1845          }
1771 1846  
1772 1847          if (attempt_reopen) {
1773 1848                  vdev_reopen(spa->spa_root_vdev);
1774 1849  
1775 1850                  /* check each device to see what state it's in */
1776 1851                  for (extracted = 0, i = 0; i < gcount; i++) {
1777 1852                          if (vd[i] != NULL &&
1778 1853                              vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
1779 1854                                  break;
1780 1855                          ++extracted;
1781 1856                  }
1782 1857          }
1783 1858  
1784 1859          /*
1785 1860           * If every disk has been moved to the new pool, or if we never
1786 1861           * even attempted to look at them, then we split them off for
1787 1862           * good.
1788 1863           */
1789 1864          if (!attempt_reopen || gcount == extracted) {
1790 1865                  for (i = 0; i < gcount; i++)
1791 1866                          if (vd[i] != NULL)
1792 1867                                  vdev_split(vd[i]);
1793 1868                  vdev_reopen(spa->spa_root_vdev);
1794 1869          }
1795 1870  
1796 1871          kmem_free(vd, gcount * sizeof (vdev_t *));
1797 1872  }
1798 1873  
1799 1874  static int
1800 1875  spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
1801 1876      boolean_t mosconfig)
1802 1877  {
1803 1878          nvlist_t *config = spa->spa_config;
1804 1879          char *ereport = FM_EREPORT_ZFS_POOL;
1805 1880          char *comment;
1806 1881          int error;
1807 1882          uint64_t pool_guid;
1808 1883          nvlist_t *nvl;
1809 1884  
1810 1885          if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid))
1811 1886                  return (EINVAL);
1812 1887  
1813 1888          ASSERT(spa->spa_comment == NULL);
1814 1889          if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
1815 1890                  spa->spa_comment = spa_strdup(comment);
1816 1891  
1817 1892          /*
1818 1893           * Versioning wasn't explicitly added to the label until later, so if
1819 1894           * it's not present treat it as the initial version.
1820 1895           */
1821 1896          if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
1822 1897              &spa->spa_ubsync.ub_version) != 0)
1823 1898                  spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
1824 1899  
1825 1900          (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
1826 1901              &spa->spa_config_txg);
1827 1902  
1828 1903          if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) &&
1829 1904              spa_guid_exists(pool_guid, 0)) {
  
1830 1905                  error = EEXIST;
1831 1906          } else {
1832 1907                  spa->spa_config_guid = pool_guid;
1833 1908  
1834 1909                  if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
1835 1910                      &nvl) == 0) {
1836 1911                          VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting,
1837 1912                              KM_SLEEP) == 0);
1838 1913                  }
1839 1914  
     1915 +                nvlist_free(spa->spa_load_info);
     1916 +                spa->spa_load_info = fnvlist_alloc();
     1917 +
1840 1918                  gethrestime(&spa->spa_loaded_ts);
1841 1919                  error = spa_load_impl(spa, pool_guid, config, state, type,
1842 1920                      mosconfig, &ereport);
1843 1921          }
1844 1922  
1845 1923          spa->spa_minref = refcount_count(&spa->spa_refcount);
1846 1924          if (error) {
1847 1925                  if (error != EEXIST) {
1848 1926                          spa->spa_loaded_ts.tv_sec = 0;
1849 1927                          spa->spa_loaded_ts.tv_nsec = 0;
1850 1928                  }
1851 1929                  if (error != EBADF) {
1852 1930                          zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
1853 1931                  }
1854 1932          }
1855 1933          spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE;
1856 1934          spa->spa_ena = 0;
1857 1935  
1858 1936          return (error);
1859 1937  }
1860 1938  
1861 1939  /*
  
1862 1940   * Load an existing storage pool, using the pool's builtin spa_config as a
1863 1941   * source of configuration information.
1864 1942   */
1865 1943  static int
1866 1944  spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
1867 1945      spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
1868 1946      char **ereport)
1869 1947  {
1870 1948          int error = 0;
1871 1949          nvlist_t *nvroot = NULL;
     1950 +        nvlist_t *label;
1872 1951          vdev_t *rvd;
1873 1952          uberblock_t *ub = &spa->spa_uberblock;
1874 1953          uint64_t children, config_cache_txg = spa->spa_config_txg;
1875 1954          int orig_mode = spa->spa_mode;
1876 1955          int parse;
1877 1956          uint64_t obj;
     1957 +        boolean_t missing_feat_write = B_FALSE;
1878 1958  
1879 1959          /*
1880 1960           * If this is an untrusted config, access the pool in read-only mode.
1881 1961           * This prevents things like resilvering recently removed devices.
1882 1962           */
1883 1963          if (!mosconfig)
1884 1964                  spa->spa_mode = FREAD;
1885 1965  
1886 1966          ASSERT(MUTEX_HELD(&spa_namespace_lock));
1887 1967  
1888 1968          spa->spa_load_state = state;
1889 1969  
1890 1970          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
1891 1971                  return (EINVAL);
1892 1972  
1893 1973          parse = (type == SPA_IMPORT_EXISTING ?
1894 1974              VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
1895 1975  
1896 1976          /*
1897 1977           * Create "The Godfather" zio to hold all async IOs
1898 1978           */
1899 1979          spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
1900 1980              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
1901 1981  
1902 1982          /*
1903 1983           * Parse the configuration into a vdev tree.  We explicitly set the
1904 1984           * value that will be returned by spa_version() since parsing the
1905 1985           * configuration requires knowing the version number.
1906 1986           */
1907 1987          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1908 1988          error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
1909 1989          spa_config_exit(spa, SCL_ALL, FTAG);
1910 1990  
1911 1991          if (error != 0)
1912 1992                  return (error);
1913 1993  
1914 1994          ASSERT(spa->spa_root_vdev == rvd);
1915 1995  
1916 1996          if (type != SPA_IMPORT_ASSEMBLE) {
1917 1997                  ASSERT(spa_guid(spa) == pool_guid);
1918 1998          }
1919 1999  
1920 2000          /*
1921 2001           * Try to open all vdevs, loading each label in the process.
1922 2002           */
1923 2003          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1924 2004          error = vdev_open(rvd);
1925 2005          spa_config_exit(spa, SCL_ALL, FTAG);
1926 2006          if (error != 0)
1927 2007                  return (error);
1928 2008  
1929 2009          /*
1930 2010           * We need to validate the vdev labels against the configuration that
1931 2011           * we have in hand, which is dependent on the setting of mosconfig. If
1932 2012           * mosconfig is true then we're validating the vdev labels based on
1933 2013           * that config.  Otherwise, we're validating against the cached config
1934 2014           * (zpool.cache) that was read when we loaded the zfs module, and then
1935 2015           * later we will recursively call spa_load() and validate against
1936 2016           * the vdev config.
1937 2017           *
1938 2018           * If we're assembling a new pool that's been split off from an
1939 2019           * existing pool, the labels haven't yet been updated so we skip
1940 2020           * validation for now.
1941 2021           */
1942 2022          if (type != SPA_IMPORT_ASSEMBLE) {
1943 2023                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1944 2024                  error = vdev_validate(rvd, mosconfig);
1945 2025                  spa_config_exit(spa, SCL_ALL, FTAG);
1946 2026  
  
1947 2027                  if (error != 0)
1948 2028                          return (error);
1949 2029  
1950 2030                  if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
1951 2031                          return (ENXIO);
1952 2032          }
1953 2033  
1954 2034          /*
1955 2035           * Find the best uberblock.
1956 2036           */
1957      -        vdev_uberblock_load(NULL, rvd, ub);
     2037 +        vdev_uberblock_load(rvd, ub, &label);
1958 2038  
1959 2039          /*
1960 2040           * If we weren't able to find a single valid uberblock, return failure.
1961 2041           */
1962      -        if (ub->ub_txg == 0)
     2042 +        if (ub->ub_txg == 0) {
     2043 +                nvlist_free(label);
1963 2044                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
     2045 +        }
1964 2046  
1965 2047          /*
1966      -         * If the pool is newer than the code, we can't open it.
     2048 +         * If the pool has an unsupported version we can't open it.
1967 2049           */
1968      -        if (ub->ub_version > SPA_VERSION)
     2050 +        if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
     2051 +                nvlist_free(label);
1969 2052                  return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
     2053 +        }
1970 2054  
     2055 +        if (ub->ub_version >= SPA_VERSION_FEATURES) {
     2056 +                nvlist_t *features;
     2057 +
     2058 +                /*
     2059 +                 * If we weren't able to find what's necessary for reading the
     2060 +                 * MOS in the label, return failure.
     2061 +                 */
     2062 +                if (label == NULL || nvlist_lookup_nvlist(label,
     2063 +                    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
     2064 +                        nvlist_free(label);
     2065 +                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
     2066 +                            ENXIO));
     2067 +                }
     2068 +
     2069 +                /*
     2070 +                 * Update our in-core representation with the definitive values
     2071 +                 * from the label.
     2072 +                 */
     2073 +                nvlist_free(spa->spa_label_features);
     2074 +                VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
     2075 +        }
     2076 +
     2077 +        nvlist_free(label);
     2078 +
1971 2079          /*
     2080 +         * Look through entries in the label nvlist's features_for_read. If
     2081 +         * there is a feature listed there which we don't understand, then we
     2082 +         * cannot open the pool.
     2083 +         */
     2084 +        if (ub->ub_version >= SPA_VERSION_FEATURES) {
     2085 +                nvlist_t *unsup_feat;
     2086 +
     2087 +                VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
     2088 +                    0);
     2089 +
     2090 +                for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
     2091 +                    NULL); nvp != NULL;
     2092 +                    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
     2093 +                        if (!zfeature_is_supported(nvpair_name(nvp))) {
     2094 +                                VERIFY(nvlist_add_string(unsup_feat,
     2095 +                                    nvpair_name(nvp), "") == 0);
     2096 +                        }
     2097 +                }
     2098 +
     2099 +                if (!nvlist_empty(unsup_feat)) {
     2100 +                        VERIFY(nvlist_add_nvlist(spa->spa_load_info,
     2101 +                            ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
     2102 +                        nvlist_free(unsup_feat);
     2103 +                        return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
     2104 +                            ENOTSUP));
     2105 +                }
     2106 +
     2107 +                nvlist_free(unsup_feat);
     2108 +        }
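        /*
         * zfeature_is_supported() is, at heart, a lookup of the feature
         * GUID string against the table of features this code
         * understands.  An illustrative sketch with a hand-rolled table
         * whose single entry is the async-destroy feature introduced by
         * this changeset:
         *
         *      static const char *known[] = {
         *              "com.delphix:async_destroy",
         *              NULL
         *      };
         *
         *      static boolean_t
         *      feature_supported(const char *guid)
         *      {
         *              for (int i = 0; known[i] != NULL; i++)
         *                      if (strcmp(guid, known[i]) == 0)
         *                              return (B_TRUE);
         *              return (B_FALSE);
         *      }
         */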
     2109 +
     2110 +        /*
1972 2111           * If the vdev guid sum doesn't match the uberblock, we have an
1973 2112           * incomplete configuration.  We first check to see if the pool
1974 2113           * is aware of the complete config (i.e. ZPOOL_CONFIG_VDEV_CHILDREN).
1975 2114           * If it is, defer the vdev_guid_sum check till later so we
1976 2115           * can handle missing vdevs.
1977 2116           */
1978 2117          if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
1979 2118              &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
1980 2119              rvd->vdev_guid_sum != ub->ub_guid_sum)
1981 2120                  return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
1982 2121  
1983 2122          if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
1984 2123                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
1985 2124                  spa_try_repair(spa, config);
1986 2125                  spa_config_exit(spa, SCL_ALL, FTAG);
1987 2126                  nvlist_free(spa->spa_config_splitting);
1988 2127                  spa->spa_config_splitting = NULL;
1989 2128          }
1990 2129  
1991 2130          /*
1992 2131           * Initialize internal SPA structures.
  
1993 2132           */
1994 2133          spa->spa_state = POOL_STATE_ACTIVE;
1995 2134          spa->spa_ubsync = spa->spa_uberblock;
1996 2135          spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
1997 2136              TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
1998 2137          spa->spa_first_txg = spa->spa_last_ubsync_txg ?
1999 2138              spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
2000 2139          spa->spa_claim_max_txg = spa->spa_first_txg;
2001 2140          spa->spa_prev_software_version = ub->ub_software_version;
2002 2141  
2003      -        error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
     2142 +        error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
2004 2143          if (error)
2005 2144                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2006 2145          spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
2007 2146  
2008 2147          if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
2009 2148                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2010 2149  
     2150 +        if (spa_version(spa) >= SPA_VERSION_FEATURES) {
     2151 +                boolean_t missing_feat_read = B_FALSE;
     2152 +                nvlist_t *unsup_feat;
     2153 +
     2154 +                if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
     2155 +                    &spa->spa_feat_for_read_obj) != 0) {
     2156 +                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
     2157 +                }
     2158 +
     2159 +                if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
     2160 +                    &spa->spa_feat_for_write_obj) != 0) {
     2161 +                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
     2162 +                }
     2163 +
     2164 +                if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
     2165 +                    &spa->spa_feat_desc_obj) != 0) {
     2166 +                        return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
     2167 +                }
     2168 +
     2169 +                VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
     2170 +                    0);
     2171 +
     2172 +                if (!feature_is_supported(spa->spa_meta_objset,
     2173 +                    spa->spa_feat_for_read_obj, spa->spa_feat_desc_obj,
     2174 +                    unsup_feat))
     2175 +                        missing_feat_read = B_TRUE;
     2176 +
     2177 +                if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
     2178 +                        if (!feature_is_supported(spa->spa_meta_objset,
     2179 +                            spa->spa_feat_for_write_obj, spa->spa_feat_desc_obj,
     2180 +                            unsup_feat))
     2181 +                                missing_feat_write = B_TRUE;
     2182 +                }
     2183 +
     2184 +                if (!nvlist_empty(unsup_feat)) {
     2185 +                        VERIFY(nvlist_add_nvlist(spa->spa_load_info,
     2186 +                            ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
     2187 +                }
     2188 +
     2189 +                nvlist_free(unsup_feat);
     2190 +
     2191 +                if (!missing_feat_read) {
     2192 +                        fnvlist_add_boolean(spa->spa_load_info,
     2193 +                            ZPOOL_CONFIG_CAN_RDONLY);
     2194 +                }
     2195 +
     2196 +                /*
     2197 +                 * If the state is SPA_LOAD_TRYIMPORT, our objective is
     2198 +                 * twofold: to determine whether the pool is available for
     2199 +                 * import in read-write mode and (if it is not) whether the
     2200 +                 * pool is available for import in read-only mode. If the pool
     2201 +                 * is available for import in read-write mode, it is displayed
     2202 +                 * as available in userland; if it is not available for import
     2203 +                 * in read-only mode, it is displayed as unavailable in
     2204 +                 * userland. If the pool is available for import in read-only
     2205 +                 * mode but not read-write mode, it is displayed as unavailable
     2206 +                 * in userland with a special note that the pool is actually
     2207 +                 * available for open in read-only mode.
     2208 +                 *
     2209 +                 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
     2210 +                 * missing a feature for write, we must first determine whether
     2211 +                 * the pool can be opened read-only before returning to
     2212 +                 * userland in order to know whether to display the
     2213 +                 * abovementioned note.
     2214 +                 */
     2215 +                if (missing_feat_read || (missing_feat_write &&
     2216 +                    spa_writeable(spa))) {
     2217 +                        return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
     2218 +                            ENOTSUP));
     2219 +                }
     2220 +        }
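        /*
         * The outcome of the feature checks above, reduced to two
         * booleans; a sketch of the decision matrix rather than
         * additional pool state:
         *
         *      can_rw = !missing_feat_read && !missing_feat_write;
         *      can_ro = !missing_feat_read;
         *
         * can_rw: the import proceeds normally.  can_ro only: a
         * writable open fails with ENOTSUP, but ZPOOL_CONFIG_CAN_RDONLY
         * in spa_load_info lets userland suggest a read-only import.
         * Neither: the pool cannot be imported at all.
         */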
     2221 +
     2222 +        spa->spa_is_initializing = B_TRUE;
     2223 +        error = dsl_pool_open(spa->spa_dsl_pool);
     2224 +        spa->spa_is_initializing = B_FALSE;
     2225 +        if (error != 0)
     2226 +                return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
     2227 +
2011 2228          if (!mosconfig) {
2012 2229                  uint64_t hostid;
2013 2230                  nvlist_t *policy = NULL, *nvconfig;
2014 2231  
2015 2232                  if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
2016 2233                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2017 2234  
2018 2235                  if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
2019 2236                      ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
2020 2237                          char *hostname;
2021 2238                          unsigned long myhostid = 0;
2022 2239  
2023 2240                          VERIFY(nvlist_lookup_string(nvconfig,
2024 2241                              ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
2025 2242  
2026 2243  #ifdef  _KERNEL
2027 2244                          myhostid = zone_get_hostid(NULL);
2028 2245  #else   /* _KERNEL */
2029 2246                          /*
2030 2247                           * We're emulating the system's hostid in userland, so
2031 2248                           * we can't use zone_get_hostid().
2032 2249                           */
2033 2250                          (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
2034 2251  #endif  /* _KERNEL */
2035 2252                          if (hostid != 0 && myhostid != 0 &&
2036 2253                              hostid != myhostid) {
2037 2254                                  nvlist_free(nvconfig);
2038 2255                                  cmn_err(CE_WARN, "pool '%s' could not be "
2039 2256                                      "loaded as it was last accessed by "
2040 2257                                      "another system (host: %s hostid: 0x%lx). "
2041 2258                                      "See: http://illumos.org/msg/ZFS-8000-EY",
2042 2259                                      spa_name(spa), hostname,
2043 2260                                      (unsigned long)hostid);
2044 2261                                  return (EBADF);
2045 2262                          }
2046 2263                  }
2047 2264                  if (nvlist_lookup_nvlist(spa->spa_config,
2048 2265                      ZPOOL_REWIND_POLICY, &policy) == 0)
2049 2266                          VERIFY(nvlist_add_nvlist(nvconfig,
2050 2267                              ZPOOL_REWIND_POLICY, policy) == 0);
2051 2268  
2052 2269                  spa_config_set(spa, nvconfig);
2053 2270                  spa_unload(spa);
2054 2271                  spa_deactivate(spa);
2055 2272                  spa_activate(spa, orig_mode);
2056 2273  
2057 2274                  return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
2058 2275          }
2059 2276  
2060 2277          if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
2061 2278                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2062 2279          error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
2063 2280          if (error != 0)
2064 2281                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2065 2282  
2066 2283          /*
2067 2284           * Load the bit that tells us to use the new accounting function
2068 2285           * (raid-z deflation).  If we have an older pool, this will not
2069 2286           * be present.
2070 2287           */
2071 2288          error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
2072 2289          if (error != 0 && error != ENOENT)
2073 2290                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2074 2291  
2075 2292          error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
2076 2293              &spa->spa_creation_version);
2077 2294          if (error != 0 && error != ENOENT)
2078 2295                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2079 2296  
2080 2297          /*
2081 2298           * Load the persistent error log.  If we have an older pool, this will
2082 2299           * not be present.
2083 2300           */
2084 2301          error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
2085 2302          if (error != 0 && error != ENOENT)
2086 2303                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2087 2304  
2088 2305          error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
2089 2306              &spa->spa_errlog_scrub);
2090 2307          if (error != 0 && error != ENOENT)
2091 2308                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2092 2309  
2093 2310          /*
2094 2311           * Load the history object.  If we have an older pool, this
2095 2312           * will not be present.
2096 2313           */
2097 2314          error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
2098 2315          if (error != 0 && error != ENOENT)
2099 2316                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2100 2317  
2101 2318          /*
2102 2319           * If we're assembling the pool from the split-off vdevs of
2103 2320           * an existing pool, we don't want to attach the spares & cache
2104 2321           * devices.
2105 2322           */
2106 2323  
2107 2324          /*
2108 2325           * Load any hot spares for this pool.
2109 2326           */
2110 2327          error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
2111 2328          if (error != 0 && error != ENOENT)
2112 2329                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2113 2330          if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2114 2331                  ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
2115 2332                  if (load_nvlist(spa, spa->spa_spares.sav_object,
2116 2333                      &spa->spa_spares.sav_config) != 0)
2117 2334                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2118 2335  
2119 2336                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2120 2337                  spa_load_spares(spa);
2121 2338                  spa_config_exit(spa, SCL_ALL, FTAG);
2122 2339          } else if (error == 0) {
2123 2340                  spa->spa_spares.sav_sync = B_TRUE;
2124 2341          }
2125 2342  
2126 2343          /*
2127 2344           * Load any level 2 ARC devices for this pool.
2128 2345           */
2129 2346          error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
2130 2347              &spa->spa_l2cache.sav_object);
2131 2348          if (error != 0 && error != ENOENT)
2132 2349                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2133 2350          if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
2134 2351                  ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
2135 2352                  if (load_nvlist(spa, spa->spa_l2cache.sav_object,
2136 2353                      &spa->spa_l2cache.sav_config) != 0)
2137 2354                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2138 2355  
2139 2356                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2140 2357                  spa_load_l2cache(spa);
2141 2358                  spa_config_exit(spa, SCL_ALL, FTAG);
2142 2359          } else if (error == 0) {
2143 2360                  spa->spa_l2cache.sav_sync = B_TRUE;
2144 2361          }
2145 2362  
2146 2363          spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
2147 2364  
2148 2365          error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
2149 2366          if (error && error != ENOENT)
2150 2367                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2151 2368  
2152 2369          if (error == 0) {
2153 2370                  uint64_t autoreplace;
2154 2371  
2155 2372                  spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
2156 2373                  spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
2157 2374                  spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
2158 2375                  spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
2159 2376                  spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
2160 2377                  spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
2161 2378                      &spa->spa_dedup_ditto);
2162 2379  
2163 2380                  spa->spa_autoreplace = (autoreplace != 0);
2164 2381          }
2165 2382  
2166 2383          /*
2167 2384           * If the 'autoreplace' property is set, then post a resource notifying
2168 2385           * the ZFS DE that it should not issue any faults for unopenable
2169 2386           * devices.  We also iterate over the vdevs, and post a sysevent for any
2170 2387           * unopenable vdevs so that the normal autoreplace handler can take
2171 2388           * over.
2172 2389           */
2173 2390          if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
2174 2391                  spa_check_removed(spa->spa_root_vdev);
2175 2392                  /*
2176 2393                   * For the import case, this is done in spa_import(), because
2177 2394                   * at this point we're using the spare definitions from
2178 2395                   * the MOS config, not necessarily from the userland config.
2179 2396                   */
2180 2397                  if (state != SPA_LOAD_IMPORT) {
2181 2398                          spa_aux_check_removed(&spa->spa_spares);
2182 2399                          spa_aux_check_removed(&spa->spa_l2cache);
2183 2400                  }
2184 2401          }
2185 2402  
2186 2403          /*
2187 2404           * Load the vdev state for all toplevel vdevs.
2188 2405           */
2189 2406          vdev_load(rvd);
2190 2407  
2191 2408          /*
2192 2409           * Propagate the leaf DTLs we just loaded all the way up the tree.
2193 2410           */
2194 2411          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2195 2412          vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
2196 2413          spa_config_exit(spa, SCL_ALL, FTAG);
2197 2414  
2198 2415          /*
2199 2416           * Load the DDTs (dedup tables).
2200 2417           */
2201 2418          error = ddt_load(spa);
2202 2419          if (error != 0)
2203 2420                  return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2204 2421  
2205 2422          spa_update_dspace(spa);
2206 2423  
2207 2424          /*
2208 2425           * Validate the config, using the MOS config to fill in any
2209 2426           * information which might be missing.  If we fail to validate
2210 2427           * the config then declare the pool unfit for use. If we're
2211 2428           * assembling a pool from a split, the log is not transferred
2212 2429           * over.
2213 2430           */
2214 2431          if (type != SPA_IMPORT_ASSEMBLE) {
2215 2432                  nvlist_t *nvconfig;
2216 2433  
2217 2434                  if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
  
  
2218 2435                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
2219 2436  
2220 2437                  if (!spa_config_valid(spa, nvconfig)) {
2221 2438                          nvlist_free(nvconfig);
2222 2439                          return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
2223 2440                              ENXIO));
2224 2441                  }
2225 2442                  nvlist_free(nvconfig);
2226 2443  
2227 2444                  /*
2228      -                 * Now that we've validate the config, check the state of the
     2445 +                 * Now that we've validated the config, check the state of the
2229 2446                   * root vdev.  If it can't be opened, it indicates one or
2230 2447                   * more toplevel vdevs are faulted.
2231 2448                   */
2232 2449                  if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
2233 2450                          return (ENXIO);
2234 2451  
2235 2452                  if (spa_check_logs(spa)) {
2236 2453                          *ereport = FM_EREPORT_ZFS_LOG_REPLAY;
2237 2454                          return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
2238 2455                  }
2239 2456          }
2240 2457  
     2458 +        if (missing_feat_write) {
     2459 +                ASSERT(state == SPA_LOAD_TRYIMPORT);
     2460 +
     2461 +                /*
     2462 +                 * At this point, we know that we can open the pool in
     2463 +                 * read-only mode but not read-write mode. We now have enough
     2464 +                 * information and can return to userland.
     2465 +                 */
     2466 +                return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
     2467 +        }
     2468 +
2241 2469          /*
2242 2470           * We've successfully opened the pool, verify that we're ready
2243 2471           * to start pushing transactions.
2244 2472           */
2245 2473          if (state != SPA_LOAD_TRYIMPORT) {
2246 2474                  if (error = spa_load_verify(spa))
2247 2475                          return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
2248 2476                              error));
2249 2477          }
2250 2478  
2251 2479          if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
2252 2480              spa->spa_load_max_txg == UINT64_MAX)) {
2253 2481                  dmu_tx_t *tx;
2254 2482                  int need_update = B_FALSE;
2255 2483  
2256 2484                  ASSERT(state != SPA_LOAD_TRYIMPORT);
2257 2485  
2258 2486                  /*
2259 2487                   * Claim log blocks that haven't been committed yet.
2260 2488                   * This must all happen in a single txg.
2261 2489                   * Note: spa_claim_max_txg is updated by spa_claim_notify(),
2262 2490                   * invoked from zil_claim_log_block()'s i/o done callback.
2263 2491                   * Price of rollback is that we abandon the log.
2264 2492                   */
2265 2493                  spa->spa_claiming = B_TRUE;
2266 2494  
2267 2495                  tx = dmu_tx_create_assigned(spa_get_dsl(spa),
2268 2496                      spa_first_txg(spa));
2269 2497                  (void) dmu_objset_find(spa_name(spa),
2270 2498                      zil_claim, tx, DS_FIND_CHILDREN);
2271 2499                  dmu_tx_commit(tx);
2272 2500  
2273 2501                  spa->spa_claiming = B_FALSE;
2274 2502  
2275 2503                  spa_set_log_state(spa, SPA_LOG_GOOD);
2276 2504                  spa->spa_sync_on = B_TRUE;
2277 2505                  txg_sync_start(spa->spa_dsl_pool);
2278 2506  
2279 2507                  /*
2280 2508                   * Wait for all claims to sync.  We sync up to the highest
2281 2509                   * claimed log block birth time so that claimed log blocks
2282 2510                   * don't appear to be from the future.  spa_claim_max_txg
2283 2511                   * will have been set for us by either zil_check_log_chain()
2284 2512                   * (invoked from spa_check_logs()) or zil_claim() above.
2285 2513                   */
2286 2514                  txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
2287 2515  
2288 2516                  /*
2289 2517                   * If the config cache is stale, or we have uninitialized
2290 2518                   * metaslabs (see spa_vdev_add()), then update the config.
2291 2519                   *
2292 2520                   * If this is a verbatim import, trust the current
2293 2521                   * in-core spa_config and update the disk labels.
2294 2522                   */
2295 2523                  if (config_cache_txg != spa->spa_config_txg ||
2296 2524                      state == SPA_LOAD_IMPORT ||
2297 2525                      state == SPA_LOAD_RECOVER ||
2298 2526                      (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
2299 2527                          need_update = B_TRUE;
2300 2528  
2301 2529                  for (int c = 0; c < rvd->vdev_children; c++)
2302 2530                          if (rvd->vdev_child[c]->vdev_ms_array == 0)
2303 2531                                  need_update = B_TRUE;
2304 2532  
2305 2533                  /*
2306 2534           * Update the config cache asynchronously in case we're the
2307 2535                   * root pool, in which case the config cache isn't writable yet.
2308 2536                   */
2309 2537                  if (need_update)
2310 2538                          spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
2311 2539  
2312 2540                  /*
2313 2541                   * Check all DTLs to see if anything needs resilvering.
2314 2542                   */
2315 2543                  if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
2316 2544                      vdev_resilver_needed(rvd, NULL, NULL))
2317 2545                          spa_async_request(spa, SPA_ASYNC_RESILVER);
2318 2546  
2319 2547                  /*
2320 2548                   * Delete any inconsistent datasets.
2321 2549                   */
2322 2550                  (void) dmu_objset_find(spa_name(spa),
2323 2551                      dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
2324 2552  
2325 2553                  /*
2326 2554                   * Clean up any stale temporary dataset userrefs.
2327 2555                   */
2328 2556                  dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
2329 2557          }
2330 2558  
2331 2559          return (0);
2332 2560  }
2333 2561  
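The missing_feat_write path above makes spa_load() fail a try-import with
ENOTSUP once it knows the pool can be opened read-only but not read-write.
A minimal sketch of how a caller might act on that contract; the read-only
fallback helper is hypothetical and shown only to illustrate the intent:

        int error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING,
            B_TRUE);
        if (error == ENOTSUP) {
                /*
                 * The pool uses features this software cannot write.
                 * A caller could retry the import in read-only mode
                 * (reimport_readonly() is a hypothetical helper).
                 */
                error = reimport_readonly(spa);
        }
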
2334 2562  static int
2335 2563  spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
2336 2564  {
2337 2565          int mode = spa->spa_mode;
2338 2566  
2339 2567          spa_unload(spa);
  
  
2340 2568          spa_deactivate(spa);
2341 2569  
2342 2570          spa->spa_load_max_txg--;
2343 2571  
2344 2572          spa_activate(spa, mode);
2345 2573          spa_async_suspend(spa);
2346 2574  
2347 2575          return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig));
2348 2576  }
2349 2577  
     2578 +/*
     2579 + * If spa_load() fails this function will try loading prior txg's. If
     2580 + * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
     2581 + * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
     2582 + * function will not rewind the pool and will return the same error as
     2583 + * spa_load().
     2584 + */
2350 2585  static int
2351 2586  spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
2352 2587      uint64_t max_request, int rewind_flags)
2353 2588  {
     2589 +        nvlist_t *loadinfo = NULL;
2354 2590          nvlist_t *config = NULL;
2355 2591          int load_error, rewind_error;
2356 2592          uint64_t safe_rewind_txg;
2357 2593          uint64_t min_txg;
2358 2594  
2359 2595          if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
2360 2596                  spa->spa_load_max_txg = spa->spa_load_txg;
2361 2597                  spa_set_log_state(spa, SPA_LOG_CLEAR);
2362 2598          } else {
2363 2599                  spa->spa_load_max_txg = max_request;
2364 2600          }
2365 2601  
2366 2602          load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
2367 2603              mosconfig);
2368 2604          if (load_error == 0)
2369 2605                  return (0);
2370 2606  
2371 2607          if (spa->spa_root_vdev != NULL)
  
  
2372 2608                  config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2373 2609  
2374 2610          spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
2375 2611          spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
2376 2612  
2377 2613          if (rewind_flags & ZPOOL_NEVER_REWIND) {
2378 2614                  nvlist_free(config);
2379 2615                  return (load_error);
2380 2616          }
2381 2617  
2382      -        /* Price of rolling back is discarding txgs, including log */
2383      -        if (state == SPA_LOAD_RECOVER)
     2618 +        if (state == SPA_LOAD_RECOVER) {
     2619 +                /* Price of rolling back is discarding txgs, including log */
2384 2620                  spa_set_log_state(spa, SPA_LOG_CLEAR);
     2621 +        } else {
     2622 +                /*
     2623 +                 * If we aren't rolling back, save the load info from our first
     2624 +                 * import attempt so that we can restore it after attempting
     2625 +                 * to rewind.
     2626 +                 */
     2627 +                loadinfo = spa->spa_load_info;
     2628 +                spa->spa_load_info = fnvlist_alloc();
     2629 +        }
2385 2630  
2386 2631          spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
2387 2632          safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
2388 2633          min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
2389 2634              TXG_INITIAL : safe_rewind_txg;
2390 2635  
2391 2636          /*
2392 2637           * Continue as long as we're finding errors, we're still within
2393 2638           * the acceptable rewind range, and we're still finding uberblocks
2394 2639           */
2395 2640          while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
2396 2641              spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
2397 2642                  if (spa->spa_load_max_txg < safe_rewind_txg)
  
  
2398 2643                          spa->spa_extreme_rewind = B_TRUE;
2399 2644                  rewind_error = spa_load_retry(spa, state, mosconfig);
2400 2645          }
2401 2646  
2402 2647          spa->spa_extreme_rewind = B_FALSE;
2403 2648          spa->spa_load_max_txg = UINT64_MAX;
2404 2649  
2405 2650          if (config && (rewind_error || state != SPA_LOAD_RECOVER))
2406 2651                  spa_config_set(spa, config);
2407 2652  
2408      -        return (state == SPA_LOAD_RECOVER ? rewind_error : load_error);
     2653 +        if (state == SPA_LOAD_RECOVER) {
     2654 +                ASSERT3P(loadinfo, ==, NULL);
     2655 +                return (rewind_error);
     2656 +        } else {
     2657 +                /* Store the rewind info as part of the initial load info */
     2658 +                fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
     2659 +                    spa->spa_load_info);
     2660 +
     2661 +                /* Restore the initial load info */
     2662 +                fnvlist_free(spa->spa_load_info);
     2663 +                spa->spa_load_info = loadinfo;
     2664 +
     2665 +                return (load_error);
     2666 +        }
2409 2667  }
2410 2668  
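One detail of the loadinfo handling above worth spelling out: when the caller
did not request recovery, the rewind attempt's load info is nested under the
initial attempt's info rather than replacing it. A minimal sketch of that
nesting, using the same fnvlist interfaces as the change itself:

        nvlist_t *initial = fnvlist_alloc();    /* info from the first attempt */
        nvlist_t *rewind = fnvlist_alloc();     /* info from the rewind attempts */

        /* fnvlist_add_nvlist() copies, so the source may be freed afterward */
        fnvlist_add_nvlist(initial, ZPOOL_CONFIG_REWIND_INFO, rewind);
        fnvlist_free(rewind);
        /* userland later sees "initial" as ZPOOL_CONFIG_LOAD_INFO */
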
2411 2669  /*
2412 2670   * Pool Open/Import
2413 2671   *
2414 2672   * The import case is identical to an open except that the configuration is sent
2415 2673   * down from userland, instead of grabbed from the configuration cache.  For the
2416 2674   * case of an open, the pool configuration will exist in the
2417 2675   * POOL_STATE_UNINITIALIZED state.
2418 2676   *
2419 2677   * The stats information (gen/count/ustats) is used to gather vdev statistics
2420 2678   * while opening the pool, without having to keep around the spa_t in some
2421 2679   * ambiguous state.
2422 2680   */
2423 2681  static int
2424 2682  spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
2425 2683      nvlist_t **config)
2426 2684  {
2427 2685          spa_t *spa;
2428 2686          spa_load_state_t state = SPA_LOAD_OPEN;
2429 2687          int error;
2430 2688          int locked = B_FALSE;
2431 2689  
2432 2690          *spapp = NULL;
2433 2691  
2434 2692          /*
2435 2693           * As disgusting as this is, we need to support recursive calls to this
2436 2694           * function because dsl_dir_open() is called during spa_load(), and ends
2437 2695           * up calling spa_open() again.  The real fix is to figure out how to
2438 2696           * avoid dsl_dir_open() calling this in the first place.
2439 2697           */
2440 2698          if (mutex_owner(&spa_namespace_lock) != curthread) {
2441 2699                  mutex_enter(&spa_namespace_lock);
2442 2700                  locked = B_TRUE;
2443 2701          }
2444 2702  
2445 2703          if ((spa = spa_lookup(pool)) == NULL) {
2446 2704                  if (locked)
2447 2705                          mutex_exit(&spa_namespace_lock);
2448 2706                  return (ENOENT);
2449 2707          }
2450 2708  
2451 2709          if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
2452 2710                  zpool_rewind_policy_t policy;
2453 2711  
2454 2712                  zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
2455 2713                      &policy);
2456 2714                  if (policy.zrp_request & ZPOOL_DO_REWIND)
2457 2715                          state = SPA_LOAD_RECOVER;
2458 2716  
2459 2717                  spa_activate(spa, spa_mode_global);
2460 2718  
2461 2719                  if (state != SPA_LOAD_RECOVER)
2462 2720                          spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
2463 2721  
2464 2722                  error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
2465 2723                      policy.zrp_request);
2466 2724  
2467 2725                  if (error == EBADF) {
2468 2726                          /*
2469 2727                           * If vdev_validate() returns failure (indicated by
2470 2728                           * EBADF), it indicates that one of the vdevs indicates
2471 2729                           * that the pool has been exported or destroyed.  If
2472 2730                           * this is the case, the config cache is out of sync and
2473 2731                           * we should remove the pool from the namespace.
2474 2732                           */
2475 2733                          spa_unload(spa);
2476 2734                          spa_deactivate(spa);
2477 2735                          spa_config_sync(spa, B_TRUE, B_TRUE);
2478 2736                          spa_remove(spa);
2479 2737                          if (locked)
2480 2738                                  mutex_exit(&spa_namespace_lock);
2481 2739                          return (ENOENT);
2482 2740                  }
2483 2741  
2484 2742                  if (error) {
2485 2743                          /*
2486 2744                           * We can't open the pool, but we still have useful
2487 2745                           * information: the state of each vdev after the
2488 2746                           * attempted vdev_open().  Return this to the user.
2489 2747                           */
2490 2748                          if (config != NULL && spa->spa_config) {
2491 2749                                  VERIFY(nvlist_dup(spa->spa_config, config,
2492 2750                                      KM_SLEEP) == 0);
2493 2751                                  VERIFY(nvlist_add_nvlist(*config,
2494 2752                                      ZPOOL_CONFIG_LOAD_INFO,
2495 2753                                      spa->spa_load_info) == 0);
2496 2754                          }
2497 2755                          spa_unload(spa);
2498 2756                          spa_deactivate(spa);
2499 2757                          spa->spa_last_open_failed = error;
2500 2758                          if (locked)
2501 2759                                  mutex_exit(&spa_namespace_lock);
2502 2760                          *spapp = NULL;
2503 2761                          return (error);
2504 2762                  }
2505 2763          }
2506 2764  
2507 2765          spa_open_ref(spa, tag);
2508 2766  
2509 2767          if (config != NULL)
2510 2768                  *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
2511 2769  
2512 2770          /*
2513 2771           * If we've recovered the pool, pass back any information we
2514 2772           * gathered while doing the load.
2515 2773           */
2516 2774          if (state == SPA_LOAD_RECOVER) {
2517 2775                  VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
2518 2776                      spa->spa_load_info) == 0);
2519 2777          }
2520 2778  
2521 2779          if (locked) {
2522 2780                  spa->spa_last_open_failed = 0;
2523 2781                  spa->spa_last_ubsync_txg = 0;
2524 2782                  spa->spa_load_txg = 0;
2525 2783                  mutex_exit(&spa_namespace_lock);
2526 2784          }
2527 2785  
2528 2786          *spapp = spa;
2529 2787  
2530 2788          return (0);
2531 2789  }
2532 2790  
2533 2791  int
2534 2792  spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
2535 2793      nvlist_t **config)
2536 2794  {
2537 2795          return (spa_open_common(name, spapp, tag, policy, config));
2538 2796  }
2539 2797  
2540 2798  int
2541 2799  spa_open(const char *name, spa_t **spapp, void *tag)
2542 2800  {
2543 2801          return (spa_open_common(name, spapp, tag, NULL, NULL));
2544 2802  }
2545 2803  
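For context, callers pair these entry points with spa_close() using the usual
FTAG convention; a minimal sketch (the pool name is illustrative):

        spa_t *spa;
        int error;

        if ((error = spa_open("tank", &spa, FTAG)) != 0)
                return (error);
        /* the open reference pins the pool against export/destroy */
        spa_close(spa, FTAG);
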
2546 2804  /*
2547 2805   * Lookup the given spa_t, incrementing the inject count in the process,
2548 2806   * preventing it from being exported or destroyed.
2549 2807   */
2550 2808  spa_t *
2551 2809  spa_inject_addref(char *name)
2552 2810  {
2553 2811          spa_t *spa;
2554 2812  
2555 2813          mutex_enter(&spa_namespace_lock);
2556 2814          if ((spa = spa_lookup(name)) == NULL) {
2557 2815                  mutex_exit(&spa_namespace_lock);
2558 2816                  return (NULL);
2559 2817          }
2560 2818          spa->spa_inject_ref++;
2561 2819          mutex_exit(&spa_namespace_lock);
2562 2820  
2563 2821          return (spa);
2564 2822  }
2565 2823  
2566 2824  void
2567 2825  spa_inject_delref(spa_t *spa)
2568 2826  {
2569 2827          mutex_enter(&spa_namespace_lock);
2570 2828          spa->spa_inject_ref--;
2571 2829          mutex_exit(&spa_namespace_lock);
2572 2830  }
2573 2831  
2574 2832  /*
2575 2833   * Add spares device information to the nvlist.
2576 2834   */
2577 2835  static void
2578 2836  spa_add_spares(spa_t *spa, nvlist_t *config)
2579 2837  {
2580 2838          nvlist_t **spares;
2581 2839          uint_t i, nspares;
2582 2840          nvlist_t *nvroot;
2583 2841          uint64_t guid;
2584 2842          vdev_stat_t *vs;
2585 2843          uint_t vsc;
2586 2844          uint64_t pool;
2587 2845  
2588 2846          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2589 2847  
2590 2848          if (spa->spa_spares.sav_count == 0)
2591 2849                  return;
2592 2850  
2593 2851          VERIFY(nvlist_lookup_nvlist(config,
2594 2852              ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2595 2853          VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
2596 2854              ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2597 2855          if (nspares != 0) {
2598 2856                  VERIFY(nvlist_add_nvlist_array(nvroot,
2599 2857                      ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
2600 2858                  VERIFY(nvlist_lookup_nvlist_array(nvroot,
2601 2859                      ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
2602 2860  
2603 2861                  /*
2604 2862                   * Go through and find any spares which have since been
2605 2863                   * repurposed as an active spare.  If this is the case, update
2606 2864                   * their status appropriately.
2607 2865                   */
2608 2866                  for (i = 0; i < nspares; i++) {
2609 2867                          VERIFY(nvlist_lookup_uint64(spares[i],
2610 2868                              ZPOOL_CONFIG_GUID, &guid) == 0);
2611 2869                          if (spa_spare_exists(guid, &pool, NULL) &&
2612 2870                              pool != 0ULL) {
2613 2871                                  VERIFY(nvlist_lookup_uint64_array(
2614 2872                                      spares[i], ZPOOL_CONFIG_VDEV_STATS,
2615 2873                                      (uint64_t **)&vs, &vsc) == 0);
2616 2874                                  vs->vs_state = VDEV_STATE_CANT_OPEN;
2617 2875                                  vs->vs_aux = VDEV_AUX_SPARED;
2618 2876                          }
2619 2877                  }
2620 2878          }
2621 2879  }
2622 2880  
2623 2881  /*
2624 2882   * Add l2cache device information to the nvlist, including vdev stats.
2625 2883   */
2626 2884  static void
2627 2885  spa_add_l2cache(spa_t *spa, nvlist_t *config)
2628 2886  {
2629 2887          nvlist_t **l2cache;
2630 2888          uint_t i, j, nl2cache;
2631 2889          nvlist_t *nvroot;
2632 2890          uint64_t guid;
2633 2891          vdev_t *vd;
2634 2892          vdev_stat_t *vs;
2635 2893          uint_t vsc;
2636 2894  
2637 2895          ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
2638 2896  
2639 2897          if (spa->spa_l2cache.sav_count == 0)
2640 2898                  return;
2641 2899  
2642 2900          VERIFY(nvlist_lookup_nvlist(config,
2643 2901              ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0);
2644 2902          VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
2645 2903              ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
2646 2904          if (nl2cache != 0) {
2647 2905                  VERIFY(nvlist_add_nvlist_array(nvroot,
2648 2906                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
2649 2907                  VERIFY(nvlist_lookup_nvlist_array(nvroot,
2650 2908                      ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
2651 2909  
2652 2910                  /*
2653 2911                   * Update level 2 cache device stats.
2654 2912                   */
2655 2913  
2656 2914                  for (i = 0; i < nl2cache; i++) {
2657 2915                          VERIFY(nvlist_lookup_uint64(l2cache[i],
2658 2916                              ZPOOL_CONFIG_GUID, &guid) == 0);
2659 2917  
2660 2918                          vd = NULL;
2661 2919                          for (j = 0; j < spa->spa_l2cache.sav_count; j++) {
2662 2920                                  if (guid ==
2663 2921                                      spa->spa_l2cache.sav_vdevs[j]->vdev_guid) {
2664 2922                                          vd = spa->spa_l2cache.sav_vdevs[j];
2665 2923                                          break;
2666 2924                                  }
2667 2925                          }
  
  
2668 2926                          ASSERT(vd != NULL);
2669 2927  
2670 2928                          VERIFY(nvlist_lookup_uint64_array(l2cache[i],
2671 2929                              ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc)
2672 2930                              == 0);
2673 2931                          vdev_get_stats(vd, vs);
2674 2932                  }
2675 2933          }
2676 2934  }
2677 2935  
     2936 +static void
     2937 +spa_add_feature_stats(spa_t *spa, nvlist_t *config)
     2938 +{
     2939 +        nvlist_t *features;
     2940 +        zap_cursor_t zc;
     2941 +        zap_attribute_t za;
     2942 +
     2943 +        ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
     2944 +        VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
     2945 +
     2946 +        if (spa->spa_feat_for_read_obj != 0) {
     2947 +                for (zap_cursor_init(&zc, spa->spa_meta_objset,
     2948 +                    spa->spa_feat_for_read_obj);
     2949 +                    zap_cursor_retrieve(&zc, &za) == 0;
     2950 +                    zap_cursor_advance(&zc)) {
     2951 +                        ASSERT(za.za_integer_length == sizeof (uint64_t) &&
     2952 +                            za.za_num_integers == 1);
     2953 +                        VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
     2954 +                            za.za_first_integer));
     2955 +                }
     2956 +                zap_cursor_fini(&zc);
     2957 +        }
     2958 +
     2959 +        if (spa->spa_feat_for_write_obj != 0) {
     2960 +                for (zap_cursor_init(&zc, spa->spa_meta_objset,
     2961 +                    spa->spa_feat_for_write_obj);
     2962 +                    zap_cursor_retrieve(&zc, &za) == 0;
     2963 +                    zap_cursor_advance(&zc)) {
     2964 +                        ASSERT(za.za_integer_length == sizeof (uint64_t) &&
     2965 +                            za.za_num_integers == 1);
     2966 +                        VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name,
     2967 +                            za.za_first_integer));
     2968 +                }
     2969 +                zap_cursor_fini(&zc);
     2970 +        }
     2971 +
     2972 +        VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
     2973 +            features) == 0);
     2974 +        nvlist_free(features);
     2975 +}
     2976 +
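spa_add_feature_stats() publishes one uint64 refcount per feature under
ZPOOL_CONFIG_FEATURE_STATS. A consumer could walk that nvlist with the
standard libnvpair iterators; a minimal sketch, error handling elided:

        nvlist_t *features;

        if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
            &features) == 0) {
                for (nvpair_t *pair = nvlist_next_nvpair(features, NULL);
                    pair != NULL; pair = nvlist_next_nvpair(features, pair)) {
                        uint64_t refcount;

                        if (nvpair_value_uint64(pair, &refcount) == 0)
                                (void) printf("%s = %llu\n",
                                    nvpair_name(pair),
                                    (u_longlong_t)refcount);
                }
        }
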
2678 2977  int
2679      -spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen)
     2978 +spa_get_stats(const char *name, nvlist_t **config,
     2979 +    char *altroot, size_t buflen)
2680 2980  {
2681 2981          int error;
2682 2982          spa_t *spa;
2683 2983  
2684 2984          *config = NULL;
2685 2985          error = spa_open_common(name, &spa, FTAG, NULL, config);
2686 2986  
2687 2987          if (spa != NULL) {
2688 2988                  /*
2689 2989                   * This still leaves a window of inconsistency where the spares
2690 2990                   * or l2cache devices could change and the config would be
2691 2991                   * self-inconsistent.
2692 2992                   */
2693 2993                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
2694 2994  
2695 2995                  if (*config != NULL) {
2696 2996                          uint64_t loadtimes[2];
2697 2997  
2698 2998                          loadtimes[0] = spa->spa_loaded_ts.tv_sec;
2699 2999                          loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
2700 3000                          VERIFY(nvlist_add_uint64_array(*config,
2701 3001                              ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
2702 3002  
2703 3003                          VERIFY(nvlist_add_uint64(*config,
  
  
2704 3004                              ZPOOL_CONFIG_ERRCOUNT,
2705 3005                              spa_get_errlog_size(spa)) == 0);
2706 3006  
2707 3007                          if (spa_suspended(spa))
2708 3008                                  VERIFY(nvlist_add_uint64(*config,
2709 3009                                      ZPOOL_CONFIG_SUSPENDED,
2710 3010                                      spa->spa_failmode) == 0);
2711 3011  
2712 3012                          spa_add_spares(spa, *config);
2713 3013                          spa_add_l2cache(spa, *config);
     3014 +                        spa_add_feature_stats(spa, *config);
2714 3015                  }
2715 3016          }
2716 3017  
2717 3018          /*
2718 3019           * We want to get the alternate root even for faulted pools, so we cheat
2719 3020           * and call spa_lookup() directly.
2720 3021           */
2721 3022          if (altroot) {
2722 3023                  if (spa == NULL) {
2723 3024                          mutex_enter(&spa_namespace_lock);
2724 3025                          spa = spa_lookup(name);
2725 3026                          if (spa)
2726 3027                                  spa_altroot(spa, altroot, buflen);
2727 3028                          else
2728 3029                                  altroot[0] = '\0';
2729 3030                          spa = NULL;
2730 3031                          mutex_exit(&spa_namespace_lock);
2731 3032                  } else {
2732 3033                          spa_altroot(spa, altroot, buflen);
2733 3034                  }
2734 3035          }
2735 3036  
2736 3037          if (spa != NULL) {
2737 3038                  spa_config_exit(spa, SCL_CONFIG, FTAG);
2738 3039                  spa_close(spa, FTAG);
2739 3040          }
2740 3041  
2741 3042          return (error);
2742 3043  }
2743 3044  
2744 3045  /*
2745 3046   * Validate that the auxiliary device array is well formed.  We must have an
2746 3047   * array of nvlists, each of which describes a valid leaf vdev.  If this is an
2747 3048   * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
2748 3049   * specified, as long as they are well-formed.
2749 3050   */
2750 3051  static int
2751 3052  spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
2752 3053      spa_aux_vdev_t *sav, const char *config, uint64_t version,
2753 3054      vdev_labeltype_t label)
2754 3055  {
2755 3056          nvlist_t **dev;
2756 3057          uint_t i, ndev;
2757 3058          vdev_t *vd;
2758 3059          int error;
2759 3060  
2760 3061          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2761 3062  
2762 3063          /*
2763 3064           * It's acceptable to have no devs specified.
2764 3065           */
2765 3066          if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
2766 3067                  return (0);
2767 3068  
2768 3069          if (ndev == 0)
2769 3070                  return (EINVAL);
2770 3071  
2771 3072          /*
2772 3073           * Make sure the pool is formatted with a version that supports this
2773 3074           * device type.
2774 3075           */
2775 3076          if (spa_version(spa) < version)
2776 3077                  return (ENOTSUP);
2777 3078  
2778 3079          /*
2779 3080           * Set the pending device list so we correctly handle device in-use
2780 3081           * checking.
2781 3082           */
2782 3083          sav->sav_pending = dev;
2783 3084          sav->sav_npending = ndev;
2784 3085  
2785 3086          for (i = 0; i < ndev; i++) {
2786 3087                  if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0,
2787 3088                      mode)) != 0)
2788 3089                          goto out;
2789 3090  
2790 3091                  if (!vd->vdev_ops->vdev_op_leaf) {
2791 3092                          vdev_free(vd);
2792 3093                          error = EINVAL;
2793 3094                          goto out;
2794 3095                  }
2795 3096  
2796 3097                  /*
2797 3098                   * The L2ARC currently only supports disk devices in
2798 3099                   * kernel context.  For user-level testing, we allow it.
2799 3100                   */
2800 3101  #ifdef _KERNEL
2801 3102                  if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
2802 3103                      strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) {
2803 3104                          error = ENOTBLK;
2804 3105                          vdev_free(vd);
2805 3106                          goto out;
2806 3107                  }
2807 3108  #endif
2808 3109                  vd->vdev_top = vd;
2809 3110  
2810 3111                  if ((error = vdev_open(vd)) == 0 &&
2811 3112                      (error = vdev_label_init(vd, crtxg, label)) == 0) {
2812 3113                          VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
2813 3114                              vd->vdev_guid) == 0);
2814 3115                  }
2815 3116  
2816 3117                  vdev_free(vd);
2817 3118  
2818 3119                  if (error &&
2819 3120                      (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE))
2820 3121                          goto out;
2821 3122                  else
2822 3123                          error = 0;
2823 3124          }
2824 3125  
2825 3126  out:
2826 3127          sav->sav_pending = NULL;
2827 3128          sav->sav_npending = 0;
2828 3129          return (error);
2829 3130  }
2830 3131  
2831 3132  static int
2832 3133  spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
2833 3134  {
2834 3135          int error;
2835 3136  
2836 3137          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
2837 3138  
2838 3139          if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2839 3140              &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
2840 3141              VDEV_LABEL_SPARE)) != 0) {
2841 3142                  return (error);
2842 3143          }
2843 3144  
2844 3145          return (spa_validate_aux_devs(spa, nvroot, crtxg, mode,
2845 3146              &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE,
2846 3147              VDEV_LABEL_L2CACHE));
2847 3148  }
2848 3149  
2849 3150  static void
2850 3151  spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
2851 3152      const char *config)
2852 3153  {
2853 3154          int i;
2854 3155  
2855 3156          if (sav->sav_config != NULL) {
2856 3157                  nvlist_t **olddevs;
2857 3158                  uint_t oldndevs;
2858 3159                  nvlist_t **newdevs;
2859 3160  
2860 3161                  /*
2861 3162                   * Generate a new dev list by concatenating with the
2862 3163                   * current dev list.
2863 3164                   */
2864 3165                  VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
2865 3166                      &olddevs, &oldndevs) == 0);
2866 3167  
2867 3168                  newdevs = kmem_alloc(sizeof (void *) *
2868 3169                      (ndevs + oldndevs), KM_SLEEP);
2869 3170                  for (i = 0; i < oldndevs; i++)
2870 3171                          VERIFY(nvlist_dup(olddevs[i], &newdevs[i],
2871 3172                              KM_SLEEP) == 0);
2872 3173                  for (i = 0; i < ndevs; i++)
2873 3174                          VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs],
2874 3175                              KM_SLEEP) == 0);
2875 3176  
2876 3177                  VERIFY(nvlist_remove(sav->sav_config, config,
2877 3178                      DATA_TYPE_NVLIST_ARRAY) == 0);
2878 3179  
2879 3180                  VERIFY(nvlist_add_nvlist_array(sav->sav_config,
2880 3181                      config, newdevs, ndevs + oldndevs) == 0);
2881 3182                  for (i = 0; i < oldndevs + ndevs; i++)
2882 3183                          nvlist_free(newdevs[i]);
2883 3184                  kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *));
2884 3185          } else {
2885 3186                  /*
2886 3187                   * Generate a new dev list.
2887 3188                   */
2888 3189                  VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
2889 3190                      KM_SLEEP) == 0);
2890 3191                  VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
2891 3192                      devs, ndevs) == 0);
2892 3193          }
2893 3194  }
2894 3195  
2895 3196  /*
2896 3197   * Stop and drop level 2 ARC devices
2897 3198   */
2898 3199  void
2899 3200  spa_l2cache_drop(spa_t *spa)
2900 3201  {
2901 3202          vdev_t *vd;
2902 3203          int i;
2903 3204          spa_aux_vdev_t *sav = &spa->spa_l2cache;
2904 3205  
2905 3206          for (i = 0; i < sav->sav_count; i++) {
2906 3207                  uint64_t pool;
2907 3208  
2908 3209                  vd = sav->sav_vdevs[i];
2909 3210                  ASSERT(vd != NULL);
2910 3211  
2911 3212                  if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
2912 3213                      pool != 0ULL && l2arc_vdev_present(vd))
2913 3214                          l2arc_remove_vdev(vd);
2914 3215          }
2915 3216  }
2916 3217  
2917 3218  /*
2918 3219   * Pool Creation
2919 3220   */
2920 3221  int
2921 3222  spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
2922 3223      const char *history_str, nvlist_t *zplprops)
2923 3224  {
  
  
2924 3225          spa_t *spa;
2925 3226          char *altroot = NULL;
2926 3227          vdev_t *rvd;
2927 3228          dsl_pool_t *dp;
2928 3229          dmu_tx_t *tx;
2929 3230          int error = 0;
2930 3231          uint64_t txg = TXG_INITIAL;
2931 3232          nvlist_t **spares, **l2cache;
2932 3233          uint_t nspares, nl2cache;
2933 3234          uint64_t version, obj;
     3235 +        boolean_t has_features;
2934 3236  
2935 3237          /*
2936 3238           * If this pool already exists, return failure.
2937 3239           */
2938 3240          mutex_enter(&spa_namespace_lock);
2939 3241          if (spa_lookup(pool) != NULL) {
2940 3242                  mutex_exit(&spa_namespace_lock);
2941 3243                  return (EEXIST);
2942 3244          }
2943 3245  
2944 3246          /*
2945 3247           * Allocate a new spa_t structure.
2946 3248           */
2947 3249          (void) nvlist_lookup_string(props,
2948 3250              zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
  
  
2949 3251          spa = spa_add(pool, NULL, altroot);
2950 3252          spa_activate(spa, spa_mode_global);
2951 3253  
2952 3254          if (props && (error = spa_prop_validate(spa, props))) {
2953 3255                  spa_deactivate(spa);
2954 3256                  spa_remove(spa);
2955 3257                  mutex_exit(&spa_namespace_lock);
2956 3258                  return (error);
2957 3259          }
2958 3260  
2959      -        if (nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION),
2960      -            &version) != 0)
     3261 +        has_features = B_FALSE;
     3262 +        for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
     3263 +            elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
     3264 +                if (zpool_prop_feature(nvpair_name(elem)))
     3265 +                        has_features = B_TRUE;
     3266 +        }
     3267 +
     3268 +        if (has_features || nvlist_lookup_uint64(props,
     3269 +            zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
2961 3270                  version = SPA_VERSION;
2962      -        ASSERT(version <= SPA_VERSION);
     3271 +        }
     3272 +        ASSERT(SPA_VERSION_IS_SUPPORTED(version));
2963 3273  
2964 3274          spa->spa_first_txg = txg;
2965 3275          spa->spa_uberblock.ub_txg = txg - 1;
2966 3276          spa->spa_uberblock.ub_version = version;
2967 3277          spa->spa_ubsync = spa->spa_uberblock;
2968 3278  
2969 3279          /*
2970 3280           * Create "The Godfather" zio to hold all async IOs
2971 3281           */
2972 3282          spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
2973 3283              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
2974 3284  
2975 3285          /*
2976 3286           * Create the root vdev.
2977 3287           */
2978 3288          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
2979 3289  
2980 3290          error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
2981 3291  
2982 3292          ASSERT(error != 0 || rvd != NULL);
2983 3293          ASSERT(error != 0 || spa->spa_root_vdev == rvd);
2984 3294  
2985 3295          if (error == 0 && !zfs_allocatable_devs(nvroot))
2986 3296                  error = EINVAL;
2987 3297  
2988 3298          if (error == 0 &&
2989 3299              (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
2990 3300              (error = spa_validate_aux(spa, nvroot, txg,
2991 3301              VDEV_ALLOC_ADD)) == 0) {
2992 3302                  for (int c = 0; c < rvd->vdev_children; c++) {
2993 3303                          vdev_metaslab_set_size(rvd->vdev_child[c]);
2994 3304                          vdev_expand(rvd->vdev_child[c], txg);
2995 3305                  }
2996 3306          }
2997 3307  
2998 3308          spa_config_exit(spa, SCL_ALL, FTAG);
2999 3309  
3000 3310          if (error != 0) {
3001 3311                  spa_unload(spa);
3002 3312                  spa_deactivate(spa);
3003 3313                  spa_remove(spa);
3004 3314                  mutex_exit(&spa_namespace_lock);
3005 3315                  return (error);
3006 3316          }
3007 3317  
3008 3318          /*
3009 3319           * Get the list of spares, if specified.
3010 3320           */
3011 3321          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3012 3322              &spares, &nspares) == 0) {
3013 3323                  VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
3014 3324                      KM_SLEEP) == 0);
3015 3325                  VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3016 3326                      ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3017 3327                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3018 3328                  spa_load_spares(spa);
3019 3329                  spa_config_exit(spa, SCL_ALL, FTAG);
3020 3330                  spa->spa_spares.sav_sync = B_TRUE;
3021 3331          }
3022 3332  
3023 3333          /*
3024 3334           * Get the list of level 2 cache devices, if specified.
3025 3335           */
3026 3336          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3027 3337              &l2cache, &nl2cache) == 0) {
  
  
3028 3338                  VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3029 3339                      NV_UNIQUE_NAME, KM_SLEEP) == 0);
3030 3340                  VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3031 3341                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3032 3342                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3033 3343                  spa_load_l2cache(spa);
3034 3344                  spa_config_exit(spa, SCL_ALL, FTAG);
3035 3345                  spa->spa_l2cache.sav_sync = B_TRUE;
3036 3346          }
3037 3347  
     3348 +        spa->spa_is_initializing = B_TRUE;
3038 3349          spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
3039 3350          spa->spa_meta_objset = dp->dp_meta_objset;
     3351 +        spa->spa_is_initializing = B_FALSE;
3040 3352  
3041 3353          /*
3042 3354           * Create DDTs (dedup tables).
3043 3355           */
3044 3356          ddt_create(spa);
3045 3357  
3046 3358          spa_update_dspace(spa);
3047 3359  
3048 3360          tx = dmu_tx_create_assigned(dp, txg);
3049 3361  
3050 3362          /*
3051 3363           * Create the pool config object.
3052 3364           */
  
  
3053 3365          spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
3054 3366              DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
3055 3367              DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
3056 3368  
3057 3369          if (zap_add(spa->spa_meta_objset,
3058 3370              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
3059 3371              sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
3060 3372                  cmn_err(CE_PANIC, "failed to add pool config");
3061 3373          }
3062 3374  
     3375 +        if (spa_version(spa) >= SPA_VERSION_FEATURES)
     3376 +                spa_feature_create_zap_objects(spa, tx);
     3377 +
3063 3378          if (zap_add(spa->spa_meta_objset,
3064 3379              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
3065 3380              sizeof (uint64_t), 1, &version, tx) != 0) {
3066 3381                  cmn_err(CE_PANIC, "failed to add pool version");
3067 3382          }
3068 3383  
3069 3384          /* Newly created pools with the right version are always deflated. */
3070 3385          if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
3071 3386                  spa->spa_deflate = TRUE;
3072 3387                  if (zap_add(spa->spa_meta_objset,
3073 3388                      DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
3074 3389                      sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
3075 3390                          cmn_err(CE_PANIC, "failed to add deflate");
3076 3391                  }
3077 3392          }
3078 3393  
3079 3394          /*
3080 3395           * Create the deferred-free bpobj.  Turn off compression
3081 3396           * because sync-to-convergence takes longer if the blocksize
3082 3397           * keeps changing.
3083 3398           */
3084 3399          obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
3085 3400          dmu_object_set_compress(spa->spa_meta_objset, obj,
3086 3401              ZIO_COMPRESS_OFF, tx);
3087 3402          if (zap_add(spa->spa_meta_objset,
3088 3403              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
3089 3404              sizeof (uint64_t), 1, &obj, tx) != 0) {
3090 3405                  cmn_err(CE_PANIC, "failed to add bpobj");
3091 3406          }
3092 3407          VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
3093 3408              spa->spa_meta_objset, obj));
3094 3409  
3095 3410          /*
3096 3411           * Create the pool's history object.
3097 3412           */
3098 3413          if (version >= SPA_VERSION_ZPOOL_HISTORY)
3099 3414                  spa_history_create_obj(spa, tx);
3100 3415  
3101 3416          /*
3102 3417           * Set pool properties.
3103 3418           */
3104 3419          spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
3105 3420          spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
3106 3421          spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
3107 3422          spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
3108 3423  
3109 3424          if (props != NULL) {
3110 3425                  spa_configfile_set(spa, props, B_FALSE);
3111 3426                  spa_sync_props(spa, props, tx);
3112 3427          }
3113 3428  
3114 3429          dmu_tx_commit(tx);
3115 3430  
3116 3431          spa->spa_sync_on = B_TRUE;
3117 3432          txg_sync_start(spa->spa_dsl_pool);
3118 3433  
3119 3434          /*
3120 3435           * We explicitly wait for the first transaction to complete so that our
3121 3436           * bean counters are appropriately updated.
3122 3437           */
3123 3438          txg_wait_synced(spa->spa_dsl_pool, txg);
3124 3439  
3125 3440          spa_config_sync(spa, B_FALSE, B_TRUE);
3126 3441  
3127 3442          if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL)
3128 3443                  (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE);
3129 3444          spa_history_log_version(spa, LOG_POOL_CREATE);
3130 3445  
3131 3446          spa->spa_minref = refcount_count(&spa->spa_refcount);
3132 3447  
3133 3448          mutex_exit(&spa_namespace_lock);
3134 3449  
3135 3450          return (0);
3136 3451  }
3137 3452  
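The has_features scan in spa_create() means that supplying any feature@
property at create time forces the pool to the current SPA_VERSION. A minimal
sketch of a props nvlist that would trip the scan; the feature name is
illustrative, and zpool_prop_feature() is assumed to match on the "feature@"
prefix:

        nvlist_t *props;

        VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_string(props, "feature@async_destroy",
            "enabled") == 0);
        /* has_features becomes B_TRUE, so version defaults to SPA_VERSION */
        error = spa_create("tank", nvroot, props, NULL, zplprops);
        /* nvroot and zplprops are assumed to be built as usual by the caller */
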
3138 3453  #ifdef _KERNEL
3139 3454  /*
3140 3455   * Get the root pool information from the root disk, then import the root pool
3141 3456   * during the system boot up time.
3142 3457   */
3143 3458  extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
3144 3459  
3145 3460  static nvlist_t *
3146 3461  spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
3147 3462  {
3148 3463          nvlist_t *config;
3149 3464          nvlist_t *nvtop, *nvroot;
3150 3465          uint64_t pgid;
3151 3466  
3152 3467          if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0)
3153 3468                  return (NULL);
3154 3469  
3155 3470          /*
3156 3471           * Add this top-level vdev to the child array.
3157 3472           */
3158 3473          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3159 3474              &nvtop) == 0);
3160 3475          VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
3161 3476              &pgid) == 0);
3162 3477          VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0);
3163 3478  
3164 3479          /*
3165 3480           * Put this pool's top-level vdevs into a root vdev.
3166 3481           */
3167 3482          VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
3168 3483          VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
3169 3484              VDEV_TYPE_ROOT) == 0);
3170 3485          VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
3171 3486          VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
3172 3487          VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
3173 3488              &nvtop, 1) == 0);
3174 3489  
3175 3490          /*
3176 3491           * Replace the existing vdev_tree with the new root vdev in
3177 3492           * this pool's configuration (remove the old, add the new).
3178 3493           */
3179 3494          VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
3180 3495          nvlist_free(nvroot);
3181 3496          return (config);
3182 3497  }
3183 3498  
3184 3499  /*
3185 3500   * Walk the vdev tree and see if we can find a device with "better"
3186 3501   * configuration. A configuration is "better" if the label on that
3187 3502   * device has a more recent txg.
3188 3503   */
3189 3504  static void
3190 3505  spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
3191 3506  {
3192 3507          for (int c = 0; c < vd->vdev_children; c++)
3193 3508                  spa_alt_rootvdev(vd->vdev_child[c], avd, txg);
3194 3509  
3195 3510          if (vd->vdev_ops->vdev_op_leaf) {
3196 3511                  nvlist_t *label;
3197 3512                  uint64_t label_txg;
3198 3513  
3199 3514                  if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
3200 3515                      &label) != 0)
3201 3516                          return;
3202 3517  
3203 3518                  VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
3204 3519                      &label_txg) == 0);
3205 3520  
3206 3521                  /*
3207 3522                   * Do we have a better boot device?
3208 3523                   */
3209 3524                  if (label_txg > *txg) {
3210 3525                          *txg = label_txg;
3211 3526                          *avd = vd;
3212 3527                  }
3213 3528                  nvlist_free(label);
3214 3529          }
3215 3530  }
3216 3531  
3217 3532  /*
3218 3533   * Import a root pool.
3219 3534   *
3220 3535   * For x86, devpath_list will consist of the devid and/or physpath name of
3221 3536   * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
3222 3537   * The GRUB "findroot" command will return the vdev we should boot.
3223 3538   *
3224 3539   * For SPARC, devpath_list consists of the physpath name of the booting device,
3225 3540   * no matter whether the root pool is a single-device pool or a mirrored pool.
3226 3541   * e.g.
3227 3542   *      "/pci@1f,0/ide@d/disk@0,0:a"
3228 3543   */
3229 3544  int
3230 3545  spa_import_rootpool(char *devpath, char *devid)
3231 3546  {
3232 3547          spa_t *spa;
3233 3548          vdev_t *rvd, *bvd, *avd = NULL;
3234 3549          nvlist_t *config, *nvtop;
3235 3550          uint64_t guid, txg;
3236 3551          char *pname;
3237 3552          int error;
3238 3553  
3239 3554          /*
3240 3555           * Read the label from the boot device and generate a configuration.
3241 3556           */
3242 3557          config = spa_generate_rootconf(devpath, devid, &guid);
  
  
3243 3558  #if defined(_OBP) && defined(_KERNEL)
3244 3559          if (config == NULL) {
3245 3560                  if (strstr(devpath, "/iscsi/ssd") != NULL) {
3246 3561                          /* iscsi boot */
3247 3562                          get_iscsi_bootpath_phy(devpath);
3248 3563                          config = spa_generate_rootconf(devpath, devid, &guid);
3249 3564                  }
3250 3565          }
3251 3566  #endif
3252 3567          if (config == NULL) {
3253      -                cmn_err(CE_NOTE, "Can not read the pool label from '%s'",
     3568 +                cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
3254 3569                      devpath);
3255 3570                  return (EIO);
3256 3571          }
3257 3572  
3258 3573          VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
3259 3574              &pname) == 0);
3260 3575          VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
3261 3576  
3262 3577          mutex_enter(&spa_namespace_lock);
3263 3578          if ((spa = spa_lookup(pname)) != NULL) {
3264 3579                  /*
3265 3580                   * Remove the existing root pool from the namespace so that we
3266 3581                   * can replace it with the correct config we just read in.
3267 3582                   */
3268 3583                  spa_remove(spa);
3269 3584          }
3270 3585  
3271 3586          spa = spa_add(pname, config, NULL);
3272 3587          spa->spa_is_root = B_TRUE;
3273 3588          spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
3274 3589  
3275 3590          /*
3276 3591           * Build up a vdev tree based on the boot device's label config.
3277 3592           */
3278 3593          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3279 3594              &nvtop) == 0);
3280 3595          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3281 3596          error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
3282 3597              VDEV_ALLOC_ROOTPOOL);
3283 3598          spa_config_exit(spa, SCL_ALL, FTAG);
3284 3599          if (error) {
3285 3600                  mutex_exit(&spa_namespace_lock);
3286 3601                  nvlist_free(config);
3287 3602                  cmn_err(CE_NOTE, "Cannot parse the config for pool '%s'",
3288 3603                      pname);
3289 3604                  return (error);
3290 3605          }
3291 3606  
3292 3607          /*
3293 3608           * Get the boot vdev.
3294 3609           */
3295 3610          if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
3296 3611                  cmn_err(CE_NOTE, "Cannot find the boot vdev for guid %llu",
3297 3612                      (u_longlong_t)guid);
3298 3613                  error = ENOENT;
3299 3614                  goto out;
3300 3615          }
3301 3616  
3302 3617          /*
3303 3618           * Determine if there is a better boot device.
3304 3619           */
3305 3620          avd = bvd;
3306 3621          spa_alt_rootvdev(rvd, &avd, &txg);
3307 3622          if (avd != bvd) {
3308 3623                  cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
3309 3624                      "try booting from '%s'", avd->vdev_path);
3310 3625                  error = EINVAL;
3311 3626                  goto out;
3312 3627          }
3313 3628  
3314 3629          /*
3315 3630           * If the boot device is part of a spare vdev then ensure that
3316 3631           * we're booting off the active spare.
3317 3632           */
3318 3633          if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
3319 3634              !bvd->vdev_isspare) {
3320 3635                  cmn_err(CE_NOTE, "The boot device is currently spared. Please "
3321 3636                      "try booting from '%s'",
3322 3637                      bvd->vdev_parent->
3323 3638                      vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
3324 3639                  error = EINVAL;
3325 3640                  goto out;
3326 3641          }
3327 3642  
3328 3643          error = 0;
3329 3644          spa_history_log_version(spa, LOG_POOL_IMPORT);
3330 3645  out:
3331 3646          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3332 3647          vdev_free(rvd);
3333 3648          spa_config_exit(spa, SCL_ALL, FTAG);
3334 3649          mutex_exit(&spa_namespace_lock);
3335 3650  
3336 3651          nvlist_free(config);
3337 3652          return (error);
3338 3653  }
3339 3654  
3340 3655  #endif
3341 3656  
3342 3657  /*
3343 3658   * Import a non-root pool into the system.
3344 3659   */
3345 3660  int
3346 3661  spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
3347 3662  {
3348 3663          spa_t *spa;
3349 3664          char *altroot = NULL;
3350 3665          spa_load_state_t state = SPA_LOAD_IMPORT;
3351 3666          zpool_rewind_policy_t policy;
3352 3667          uint64_t mode = spa_mode_global;
3353 3668          uint64_t readonly = B_FALSE;
3354 3669          int error;
3355 3670          nvlist_t *nvroot;
3356 3671          nvlist_t **spares, **l2cache;
3357 3672          uint_t nspares, nl2cache;
3358 3673  
3359 3674          /*
3360 3675           * If a pool with this name exists, return failure.
3361 3676           */
3362 3677          mutex_enter(&spa_namespace_lock);
3363 3678          if (spa_lookup(pool) != NULL) {
3364 3679                  mutex_exit(&spa_namespace_lock);
3365 3680                  return (EEXIST);
3366 3681          }
3367 3682  
3368 3683          /*
3369 3684           * Create and initialize the spa structure.
3370 3685           */
3371 3686          (void) nvlist_lookup_string(props,
3372 3687              zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
3373 3688          (void) nvlist_lookup_uint64(props,
3374 3689              zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
3375 3690          if (readonly)
3376 3691                  mode = FREAD;
3377 3692          spa = spa_add(pool, config, altroot);
3378 3693          spa->spa_import_flags = flags;
3379 3694  
3380 3695          /*
3381 3696           * Verbatim import - Take a pool and insert it into the namespace
3382 3697           * as if it had been loaded at boot.
3383 3698           */
3384 3699          if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
3385 3700                  if (props != NULL)
3386 3701                          spa_configfile_set(spa, props, B_FALSE);
3387 3702  
3388 3703                  spa_config_sync(spa, B_FALSE, B_TRUE);
3389 3704  
3390 3705                  mutex_exit(&spa_namespace_lock);
3391 3706                  spa_history_log_version(spa, LOG_POOL_IMPORT);
3392 3707  
3393 3708                  return (0);
3394 3709          }
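/*
 * [Editorial note] In the verbatim case nothing is loaded or validated: the
 * pool is entered into the namespace and the cache file is rewritten as-is,
 * so the supplied config must already be trustworthy.  This is why the
 * boot-time root-pool import above, which reads its config straight off the
 * device labels, sets ZFS_IMPORT_VERBATIM.
 */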
3395 3710  
3396 3711          spa_activate(spa, mode);
3397 3712  
3398 3713          /*
3399 3714           * Don't start async tasks until we know everything is healthy.
3400 3715           */
3401 3716          spa_async_suspend(spa);
3402 3717  
3403 3718          zpool_get_rewind_policy(config, &policy);
3404 3719          if (policy.zrp_request & ZPOOL_DO_REWIND)
3405 3720                  state = SPA_LOAD_RECOVER;
3406 3721  
3407 3722          /*
3408 3723           * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
3409 3724           * because the user-supplied config is actually the one to trust when
3410 3725           * doing an import.
3411 3726           */
3412 3727          if (state != SPA_LOAD_RECOVER)
3413 3728                  spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
3414 3729  
3415 3730          error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
3416 3731              policy.zrp_request);
3417 3732  
3418 3733          /*
3419 3734           * Propagate anything learned while loading the pool and pass it
3420 3735           * back to caller (i.e. rewind info, missing devices, etc).
3421 3736           */
3422 3737          VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
3423 3738              spa->spa_load_info) == 0);
3424 3739  
3425 3740          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3426 3741          /*
3427 3742           * Toss any existing sparelist, as it doesn't have any validity
3428 3743           * anymore, and conflicts with spa_has_spare().
3429 3744           */
3430 3745          if (spa->spa_spares.sav_config) {
3431 3746                  nvlist_free(spa->spa_spares.sav_config);
3432 3747                  spa->spa_spares.sav_config = NULL;
3433 3748                  spa_load_spares(spa);
3434 3749          }
3435 3750          if (spa->spa_l2cache.sav_config) {
3436 3751                  nvlist_free(spa->spa_l2cache.sav_config);
3437 3752                  spa->spa_l2cache.sav_config = NULL;
3438 3753                  spa_load_l2cache(spa);
3439 3754          }
3440 3755  
3441 3756          VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
3442 3757              &nvroot) == 0);
3443 3758          if (error == 0)
3444 3759                  error = spa_validate_aux(spa, nvroot, -1ULL,
3445 3760                      VDEV_ALLOC_SPARE);
3446 3761          if (error == 0)
3447 3762                  error = spa_validate_aux(spa, nvroot, -1ULL,
3448 3763                      VDEV_ALLOC_L2CACHE);
3449 3764          spa_config_exit(spa, SCL_ALL, FTAG);
3450 3765  
3451 3766          if (props != NULL)
3452 3767                  spa_configfile_set(spa, props, B_FALSE);
3453 3768  
3454 3769          if (error != 0 || (props && spa_writeable(spa) &&
3455 3770              (error = spa_prop_set(spa, props)))) {
3456 3771                  spa_unload(spa);
3457 3772                  spa_deactivate(spa);
3458 3773                  spa_remove(spa);
3459 3774                  mutex_exit(&spa_namespace_lock);
3460 3775                  return (error);
3461 3776          }
3462 3777  
3463 3778          spa_async_resume(spa);
3464 3779  
3465 3780          /*
3466 3781           * Override any spares and level 2 cache devices as specified by
3467 3782           * the user, as these may have correct device names/devids, etc.
3468 3783           */
3469 3784          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
3470 3785              &spares, &nspares) == 0) {
3471 3786                  if (spa->spa_spares.sav_config)
3472 3787                          VERIFY(nvlist_remove(spa->spa_spares.sav_config,
3473 3788                              ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
3474 3789                  else
3475 3790                          VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
3476 3791                              NV_UNIQUE_NAME, KM_SLEEP) == 0);
3477 3792                  VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
3478 3793                      ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
3479 3794                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3480 3795                  spa_load_spares(spa);
3481 3796                  spa_config_exit(spa, SCL_ALL, FTAG);
3482 3797                  spa->spa_spares.sav_sync = B_TRUE;
3483 3798          }
3484 3799          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
3485 3800              &l2cache, &nl2cache) == 0) {
3486 3801                  if (spa->spa_l2cache.sav_config)
3487 3802                          VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
3488 3803                              ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
3489 3804                  else
3490 3805                          VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
3491 3806                              NV_UNIQUE_NAME, KM_SLEEP) == 0);
3492 3807                  VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
3493 3808                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
3494 3809                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3495 3810                  spa_load_l2cache(spa);
3496 3811                  spa_config_exit(spa, SCL_ALL, FTAG);
3497 3812                  spa->spa_l2cache.sav_sync = B_TRUE;
3498 3813          }
3499 3814  
3500 3815          /*
3501 3816           * Check for any removed devices.
3502 3817           */
3503 3818          if (spa->spa_autoreplace) {
3504 3819                  spa_aux_check_removed(&spa->spa_spares);
3505 3820                  spa_aux_check_removed(&spa->spa_l2cache);
3506 3821          }
3507 3822  
3508 3823          if (spa_writeable(spa)) {
3509 3824                  /*
3510 3825                   * Update the config cache to include the newly-imported pool.
3511 3826                   */
3512 3827                  spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3513 3828          }
3514 3829  
3515 3830          /*
3516 3831           * It's possible that the pool was expanded while it was exported.
3517 3832           * We kick off an async task to handle this for us.
3518 3833           */
3519 3834          spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
3520 3835  
3521 3836          mutex_exit(&spa_namespace_lock);
3522 3837          spa_history_log_version(spa, LOG_POOL_IMPORT);
3523 3838  
3524 3839          return (0);
3525 3840  }
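/*
 * [Editorial sketch, not part of the webrev] A hypothetical kernel-side
 * caller asking spa_import() for a read-only import; the props nvlist layout
 * matches the ZPOOL_PROP_READONLY lookup in the function above.
 */
static int
import_readonly(const char *pool, nvlist_t *config)
{
        nvlist_t *props;
        int error;

        VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, KM_SLEEP) == 0);
        VERIFY(nvlist_add_uint64(props,
            zpool_prop_to_name(ZPOOL_PROP_READONLY), 1ULL) == 0);

        /* flags == 0: an ordinary, non-verbatim import */
        error = spa_import(pool, config, props, 0);
        nvlist_free(props);
        return (error);
}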
3526 3841  
3527 3842  nvlist_t *
3528 3843  spa_tryimport(nvlist_t *tryconfig)
3529 3844  {
3530 3845          nvlist_t *config = NULL;
3531 3846          char *poolname;
3532 3847          spa_t *spa;
3533 3848          uint64_t state;
3534 3849          int error;
3535 3850  
3536 3851          if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
3537 3852                  return (NULL);
3538 3853  
3539 3854          if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
3540 3855                  return (NULL);
3541 3856  
3542 3857          /*
3543 3858           * Create and initialize the spa structure.
3544 3859           */
3545 3860          mutex_enter(&spa_namespace_lock);
3546 3861          spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
3547 3862          spa_activate(spa, FREAD);
3548 3863  
3549 3864          /*
3550 3865           * Pass off the heavy lifting to spa_load().
3551 3866           * Pass TRUE for mosconfig because the user-supplied config
3552 3867           * is actually the one to trust when doing an import.
3553 3868           */
3554 3869          error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
3555 3870  
3556 3871          /*
  
      (293 lines elided)
  
3557 3872           * If 'tryconfig' was at least parsable, return the current config.
3558 3873           */
3559 3874          if (spa->spa_root_vdev != NULL) {
3560 3875                  config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
3561 3876                  VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME,
3562 3877                      poolname) == 0);
3563 3878                  VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
3564 3879                      state) == 0);
3565 3880                  VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
3566 3881                      spa->spa_uberblock.ub_timestamp) == 0);
     3882 +                VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
     3883 +                    spa->spa_load_info) == 0);
3567 3884  
3568 3885                  /*
3569 3886                   * If the bootfs property exists on this pool then we
3570 3887                   * copy it out so that external consumers can tell which
3571 3888                   * pools are bootable.
3572 3889                   */
3573 3890                  if ((!error || error == EEXIST) && spa->spa_bootfs) {
3574 3891                          char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3575 3892  
3576 3893                          /*
3577 3894                           * We have to play games with the name since the
3578 3895                           * pool was opened as TRYIMPORT_NAME.
3579 3896                           */
3580 3897                          if (dsl_dsobj_to_dsname(spa_name(spa),
3581 3898                              spa->spa_bootfs, tmpname) == 0) {
3582 3899                                  char *cp;
3583 3900                                  char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
3584 3901  
3585 3902                                  cp = strchr(tmpname, '/');
3586 3903                                  if (cp == NULL) {
3587 3904                                          (void) strlcpy(dsname, tmpname,
3588 3905                                              MAXPATHLEN);
3589 3906                                  } else {
3590 3907                                          (void) snprintf(dsname, MAXPATHLEN,
3591 3908                                              "%s/%s", poolname, ++cp);
3592 3909                                  }
3593 3910                                  VERIFY(nvlist_add_string(config,
3594 3911                                      ZPOOL_CONFIG_BOOTFS, dsname) == 0);
3595 3912                                  kmem_free(dsname, MAXPATHLEN);
3596 3913                          }
3597 3914                          kmem_free(tmpname, MAXPATHLEN);
3598 3915                  }
3599 3916  
3600 3917                  /*
3601 3918                   * Add the list of hot spares and level 2 cache devices.
3602 3919                   */
3603 3920                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
3604 3921                  spa_add_spares(spa, config);
3605 3922                  spa_add_l2cache(spa, config);
3606 3923                  spa_config_exit(spa, SCL_CONFIG, FTAG);
3607 3924          }
3608 3925  
3609 3926          spa_unload(spa);
3610 3927          spa_deactivate(spa);
3611 3928          spa_remove(spa);
3612 3929          mutex_exit(&spa_namespace_lock);
3613 3930  
3614 3931          return (config);
3615 3932  }
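/*
 * [Editorial sketch, not part of the webrev] spa_tryimport() is a read-only
 * probe; a caller inspects the returned config and frees it.  A hypothetical
 * consumer checking for bootability:
 */
static void
probe_pool(nvlist_t *tryconfig)
{
        nvlist_t *config = spa_tryimport(tryconfig);
        char *bootfs;

        if (config == NULL)
                return;         /* tryconfig was not even parsable */

        /* ZPOOL_CONFIG_BOOTFS is added above only for bootable pools */
        if (nvlist_lookup_string(config, ZPOOL_CONFIG_BOOTFS, &bootfs) == 0)
                cmn_err(CE_NOTE, "bootable pool, bootfs=%s", bootfs);

        nvlist_free(config);
}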
3616 3933  
3617 3934  /*
3618 3935   * Pool export/destroy
3619 3936   *
3620 3937   * The act of destroying or exporting a pool is very simple.  We make sure there
3621 3938   * is no more pending I/O and any references to the pool are gone.  Then, we
3622 3939   * update the pool state and sync all the labels to disk, removing the
3623 3940   * configuration from the cache afterwards. If the 'hardforce' flag is set, then
3624 3941   * we don't sync the labels or remove the configuration cache.
3625 3942   */
3626 3943  static int
3627 3944  spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
3628 3945      boolean_t force, boolean_t hardforce)
3629 3946  {
3630 3947          spa_t *spa;
3631 3948  
3632 3949          if (oldconfig)
3633 3950                  *oldconfig = NULL;
3634 3951  
3635 3952          if (!(spa_mode_global & FWRITE))
3636 3953                  return (EROFS);
3637 3954  
3638 3955          mutex_enter(&spa_namespace_lock);
3639 3956          if ((spa = spa_lookup(pool)) == NULL) {
3640 3957                  mutex_exit(&spa_namespace_lock);
3641 3958                  return (ENOENT);
3642 3959          }
3643 3960  
3644 3961          /*
3645 3962           * Put a hold on the pool, drop the namespace lock, stop async tasks,
3646 3963           * reacquire the namespace lock, and see if we can export.
3647 3964           */
3648 3965          spa_open_ref(spa, FTAG);
3649 3966          mutex_exit(&spa_namespace_lock);
3650 3967          spa_async_suspend(spa);
3651 3968          mutex_enter(&spa_namespace_lock);
3652 3969          spa_close(spa, FTAG);
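/*
 * [Editorial note] The hold/drop/suspend/reacquire dance above is presumably
 * needed because suspending the async thread while holding the namespace
 * lock could deadlock (async tasks may take that lock themselves); the
 * spa_open_ref() hold keeps the spa_t alive across the window where the
 * lock is dropped.
 */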
3653 3970  
3654 3971          /*
3655 3972           * The pool will be in core if it's openable,
3656 3973           * in which case we can modify its state.
3657 3974           */
3658 3975          if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
3659 3976                  /*
3660 3977                   * Objsets may be open only because they're dirty, so we
3661 3978                   * have to force it to sync before checking spa_refcnt.
3662 3979                   */
3663 3980                  txg_wait_synced(spa->spa_dsl_pool, 0);
3664 3981  
3665 3982                  /*
3666 3983                   * A pool cannot be exported or destroyed if there are active
3667 3984                   * references.  If we are resetting a pool, allow references by
3668 3985                   * fault injection handlers.
3669 3986                   */
3670 3987                  if (!spa_refcount_zero(spa) ||
3671 3988                      (spa->spa_inject_ref != 0 &&
3672 3989                      new_state != POOL_STATE_UNINITIALIZED)) {
3673 3990                          spa_async_resume(spa);
3674 3991                          mutex_exit(&spa_namespace_lock);
3675 3992                          return (EBUSY);
3676 3993                  }
3677 3994  
3678 3995                  /*
3679 3996                   * A pool cannot be exported if it has an active shared spare.
3680 3997                   * This is to prevent other pools from stealing the active
3681 3998                   * spare from an exported pool.  If the user insists, such a
3682 3999                   * pool can still be forcibly exported.
3683 4000                   */
3684 4001                  if (!force && new_state == POOL_STATE_EXPORTED &&
3685 4002                      spa_has_active_shared_spare(spa)) {
3686 4003                          spa_async_resume(spa);
3687 4004                          mutex_exit(&spa_namespace_lock);
3688 4005                          return (EXDEV);
3689 4006                  }
3690 4007  
3691 4008                  /*
3692 4009                   * We want this to be reflected on every label,
3693 4010                   * so mark them all dirty.  spa_unload() will do the
3694 4011                   * final sync that pushes these changes out.
3695 4012                   */
3696 4013                  if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
3697 4014                          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
3698 4015                          spa->spa_state = new_state;
3699 4016                          spa->spa_final_txg = spa_last_synced_txg(spa) +
3700 4017                              TXG_DEFER_SIZE + 1;
3701 4018                          vdev_config_dirty(spa->spa_root_vdev);
3702 4019                          spa_config_exit(spa, SCL_ALL, FTAG);
3703 4020                  }
3704 4021          }
3705 4022  
3706 4023          spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
3707 4024  
3708 4025          if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
3709 4026                  spa_unload(spa);
3710 4027                  spa_deactivate(spa);
3711 4028          }
3712 4029  
3713 4030          if (oldconfig && spa->spa_config)
3714 4031                  VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
3715 4032  
3716 4033          if (new_state != POOL_STATE_UNINITIALIZED) {
3717 4034                  if (!hardforce)
3718 4035                          spa_config_sync(spa, B_TRUE, B_TRUE);
3719 4036                  spa_remove(spa);
3720 4037          }
3721 4038          mutex_exit(&spa_namespace_lock);
3722 4039  
3723 4040          return (0);
3724 4041  }
3725 4042  
3726 4043  /*
3727 4044   * Destroy a storage pool.
3728 4045   */
3729 4046  int
3730 4047  spa_destroy(char *pool)
3731 4048  {
3732 4049          return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
3733 4050              B_FALSE, B_FALSE));
3734 4051  }
3735 4052  
3736 4053  /*
3737 4054   * Export a storage pool.
3738 4055   */
3739 4056  int
3740 4057  spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
3741 4058      boolean_t hardforce)
3742 4059  {
3743 4060          return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
3744 4061              force, hardforce));
3745 4062  }
3746 4063  
3747 4064  /*
3748 4065   * Similar to spa_export(), this unloads the spa_t without actually removing it
3749 4066   * from the namespace in any way.
3750 4067   */
3751 4068  int
3752 4069  spa_reset(char *pool)
3753 4070  {
3754 4071          return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
3755 4072              B_FALSE, B_FALSE));
3756 4073  }
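/*
 * [Editorial note] The three wrappers above differ only in what they hand to
 * spa_export_common():
 *
 *      spa_destroy()   new_state = POOL_STATE_DESTROYED,     no force
 *      spa_export()    new_state = POOL_STATE_EXPORTED,      caller's force
 *      spa_reset()     new_state = POOL_STATE_UNINITIALIZED, no force
 */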
3757 4074  
3758 4075  /*
3759 4076   * ==========================================================================
3760 4077   * Device manipulation
3761 4078   * ==========================================================================
3762 4079   */
3763 4080  
3764 4081  /*
3765 4082   * Add a device to a storage pool.
3766 4083   */
3767 4084  int
3768 4085  spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
3769 4086  {
3770 4087          uint64_t txg, id;
3771 4088          int error;
3772 4089          vdev_t *rvd = spa->spa_root_vdev;
3773 4090          vdev_t *vd, *tvd;
3774 4091          nvlist_t **spares, **l2cache;
3775 4092          uint_t nspares, nl2cache;
3776 4093  
3777 4094          ASSERT(spa_writeable(spa));
3778 4095  
3779 4096          txg = spa_vdev_enter(spa);
3780 4097  
3781 4098          if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
3782 4099              VDEV_ALLOC_ADD)) != 0)
3783 4100                  return (spa_vdev_exit(spa, NULL, txg, error));
3784 4101  
3785 4102          spa->spa_pending_vdev = vd;     /* spa_vdev_exit() will clear this */
3786 4103  
3787 4104          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
3788 4105              &nspares) != 0)
3789 4106                  nspares = 0;
3790 4107  
3791 4108          if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
3792 4109              &nl2cache) != 0)
3793 4110                  nl2cache = 0;
3794 4111  
3795 4112          if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
3796 4113                  return (spa_vdev_exit(spa, vd, txg, EINVAL));
3797 4114  
3798 4115          if (vd->vdev_children != 0 &&
3799 4116              (error = vdev_create(vd, txg, B_FALSE)) != 0)
3800 4117                  return (spa_vdev_exit(spa, vd, txg, error));
3801 4118  
3802 4119          /*
3803 4120           * We must validate the spares and l2cache devices after checking the
3804 4121           * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
3805 4122           */
3806 4123          if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
3807 4124                  return (spa_vdev_exit(spa, vd, txg, error));
3808 4125  
3809 4126          /*
3810 4127           * Transfer each new top-level vdev from vd to rvd.
3811 4128           */
3812 4129          for (int c = 0; c < vd->vdev_children; c++) {
3813 4130  
3814 4131                  /*
3815 4132                   * Set the vdev id to the first hole, if one exists.
3816 4133                   */
3817 4134                  for (id = 0; id < rvd->vdev_children; id++) {
3818 4135                          if (rvd->vdev_child[id]->vdev_ishole) {
3819 4136                                  vdev_free(rvd->vdev_child[id]);
3820 4137                                  break;
3821 4138                          }
3822 4139                  }
3823 4140                  tvd = vd->vdev_child[c];
3824 4141                  vdev_remove_child(vd, tvd);
3825 4142                  tvd->vdev_id = id;
3826 4143                  vdev_add_child(rvd, tvd);
3827 4144                  vdev_config_dirty(tvd);
3828 4145          }
3829 4146  
3830 4147          if (nspares != 0) {
3831 4148                  spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
3832 4149                      ZPOOL_CONFIG_SPARES);
3833 4150                  spa_load_spares(spa);
3834 4151                  spa->spa_spares.sav_sync = B_TRUE;
3835 4152          }
3836 4153  
3837 4154          if (nl2cache != 0) {
3838 4155                  spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
3839 4156                      ZPOOL_CONFIG_L2CACHE);
3840 4157                  spa_load_l2cache(spa);
3841 4158                  spa->spa_l2cache.sav_sync = B_TRUE;
3842 4159          }
3843 4160  
3844 4161          /*
3845 4162           * We have to be careful when adding new vdevs to an existing pool.
3846 4163           * If other threads start allocating from these vdevs before we
3847 4164           * sync the config cache, and we lose power, then upon reboot we may
3848 4165           * fail to open the pool because there are DVAs that the config cache
3849 4166           * can't translate.  Therefore, we first add the vdevs without
3850 4167           * initializing metaslabs; sync the config cache (via spa_vdev_exit());
3851 4168           * and then let spa_config_update() initialize the new metaslabs.
3852 4169           *
3853 4170           * spa_load() checks for added-but-not-initialized vdevs, so that
3854 4171           * if we lose power at any point in this sequence, the remaining
3855 4172           * steps will be completed the next time we load the pool.
3856 4173           */
3857 4174          (void) spa_vdev_exit(spa, vd, txg, 0);
3858 4175  
3859 4176          mutex_enter(&spa_namespace_lock);
3860 4177          spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
3861 4178          mutex_exit(&spa_namespace_lock);
3862 4179  
3863 4180          return (0);
3864 4181  }
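/*
 * [Editorial sketch, not part of the webrev] The id-assignment loop in
 * spa_vdev_add() reuses the slot of the first "hole" (a previously removed
 * top-level vdev) before appending at the end.  The same slot-picking logic
 * in self-contained form, over a hypothetical boolean array:
 */
static int
first_free_slot(boolean_t *is_hole, int nslots)
{
        int id;

        for (id = 0; id < nslots; id++)
                if (is_hole[id])
                        break;          /* reuse a removed vdev's id */
        return (id);                    /* id == nslots: append at the end */
}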
3865 4182  
3866 4183  /*
3867 4184   * Attach a device to a mirror.  The arguments are the path to any device
3868 4185   * in the mirror, and the nvroot for the new device.  If the path specifies
3869 4186   * a device that is not mirrored, we automatically insert the mirror vdev.
3870 4187   *
3871 4188   * If 'replacing' is specified, the new device is intended to replace the
3872 4189   * existing device; in this case the two devices are made into their own
3873 4190   * mirror using the 'replacing' vdev, which is functionally identical to
3874 4191   * the mirror vdev (it actually reuses all the same ops) but has a few
3875 4192   * extra rules: you can't attach to it after it's been created, and upon
3876 4193   * completion of resilvering, the first disk (the one being replaced)
3877 4194   * is automatically detached.
3878 4195   */
3879 4196  int
3880 4197  spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
3881 4198  {
3882 4199          uint64_t txg, dtl_max_txg;
3883 4200          vdev_t *rvd = spa->spa_root_vdev;
3884 4201          vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
3885 4202          vdev_ops_t *pvops;
3886 4203          char *oldvdpath, *newvdpath;
3887 4204          int newvd_isspare;
3888 4205          int error;
3889 4206  
3890 4207          ASSERT(spa_writeable(spa));
3891 4208  
3892 4209          txg = spa_vdev_enter(spa);
3893 4210  
3894 4211          oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
3895 4212  
3896 4213          if (oldvd == NULL)
3897 4214                  return (spa_vdev_exit(spa, NULL, txg, ENODEV));
3898 4215  
3899 4216          if (!oldvd->vdev_ops->vdev_op_leaf)
3900 4217                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
3901 4218  
3902 4219          pvd = oldvd->vdev_parent;
3903 4220  
3904 4221          if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
3905 4222              VDEV_ALLOC_ATTACH)) != 0)
3906 4223                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
3907 4224  
3908 4225          if (newrootvd->vdev_children != 1)
3909 4226                  return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3910 4227  
3911 4228          newvd = newrootvd->vdev_child[0];
3912 4229  
3913 4230          if (!newvd->vdev_ops->vdev_op_leaf)
3914 4231                  return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
3915 4232  
3916 4233          if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
3917 4234                  return (spa_vdev_exit(spa, newrootvd, txg, error));
3918 4235  
3919 4236          /*
3920 4237           * Spares can't replace logs
3921 4238           */
3922 4239          if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
3923 4240                  return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3924 4241  
3925 4242          if (!replacing) {
3926 4243                  /*
3927 4244                   * For attach, the only allowable parent is a mirror or the root
3928 4245                   * vdev.
3929 4246                   */
3930 4247                  if (pvd->vdev_ops != &vdev_mirror_ops &&
3931 4248                      pvd->vdev_ops != &vdev_root_ops)
3932 4249                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3933 4250  
3934 4251                  pvops = &vdev_mirror_ops;
3935 4252          } else {
3936 4253                  /*
3937 4254                   * Active hot spares can only be replaced by inactive hot
3938 4255                   * spares.
3939 4256                   */
3940 4257                  if (pvd->vdev_ops == &vdev_spare_ops &&
3941 4258                      oldvd->vdev_isspare &&
3942 4259                      !spa_has_spare(spa, newvd->vdev_guid))
3943 4260                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3944 4261  
3945 4262                  /*
3946 4263                   * If the source is a hot spare, and the parent isn't already a
3947 4264                   * spare, then we want to create a new hot spare.  Otherwise, we
3948 4265                   * want to create a replacing vdev.  The user is not allowed to
3949 4266                   * attach to a spared vdev child unless the 'isspare' state is
3950 4267                   * the same (spare replaces spare, non-spare replaces
3951 4268                   * non-spare).
3952 4269                   */
3953 4270                  if (pvd->vdev_ops == &vdev_replacing_ops &&
3954 4271                      spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
3955 4272                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3956 4273                  } else if (pvd->vdev_ops == &vdev_spare_ops &&
3957 4274                      newvd->vdev_isspare != oldvd->vdev_isspare) {
3958 4275                          return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
3959 4276                  }
3960 4277  
3961 4278                  if (newvd->vdev_isspare)
3962 4279                          pvops = &vdev_spare_ops;
3963 4280                  else
3964 4281                          pvops = &vdev_replacing_ops;
3965 4282          }
3966 4283  
3967 4284          /*
3968 4285           * Make sure the new device is big enough.
3969 4286           */
3970 4287          if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
3971 4288                  return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
3972 4289  
3973 4290          /*
3974 4291           * The new device cannot have a higher alignment requirement
3975 4292           * than the top-level vdev.
3976 4293           */
3977 4294          if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
3978 4295                  return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
3979 4296  
3980 4297          /*
3981 4298           * If this is an in-place replacement, update oldvd's path and devid
3982 4299           * to make it distinguishable from newvd, and unopenable from now on.
3983 4300           */
3984 4301          if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
3985 4302                  spa_strfree(oldvd->vdev_path);
3986 4303                  oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5,
3987 4304                      KM_SLEEP);
3988 4305                  (void) sprintf(oldvd->vdev_path, "%s/%s",
3989 4306                      newvd->vdev_path, "old");
3990 4307                  if (oldvd->vdev_devid != NULL) {
3991 4308                          spa_strfree(oldvd->vdev_devid);
3992 4309                          oldvd->vdev_devid = NULL;
3993 4310                  }
3994 4311          }
3995 4312  
3996 4313          /* mark the device being resilvered */
3997 4314          newvd->vdev_resilvering = B_TRUE;
3998 4315  
3999 4316          /*
4000 4317           * If the parent is not a mirror, or if we're replacing, insert the new
4001 4318           * mirror/replacing/spare vdev above oldvd.
4002 4319           */
4003 4320          if (pvd->vdev_ops != pvops)
4004 4321                  pvd = vdev_add_parent(oldvd, pvops);
4005 4322  
4006 4323          ASSERT(pvd->vdev_top->vdev_parent == rvd);
4007 4324          ASSERT(pvd->vdev_ops == pvops);
4008 4325          ASSERT(oldvd->vdev_parent == pvd);
4009 4326  
4010 4327          /*
4011 4328           * Extract the new device from its root and add it to pvd.
4012 4329           */
4013 4330          vdev_remove_child(newrootvd, newvd);
4014 4331          newvd->vdev_id = pvd->vdev_children;
4015 4332          newvd->vdev_crtxg = oldvd->vdev_crtxg;
4016 4333          vdev_add_child(pvd, newvd);
4017 4334  
4018 4335          tvd = newvd->vdev_top;
4019 4336          ASSERT(pvd->vdev_top == tvd);
4020 4337          ASSERT(tvd->vdev_parent == rvd);
4021 4338  
4022 4339          vdev_config_dirty(tvd);
4023 4340  
4024 4341          /*
4025 4342           * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
4026 4343           * for any dmu_sync-ed blocks.  It will propagate upward when
4027 4344           * spa_vdev_exit() calls vdev_dtl_reassess().
4028 4345           */
4029 4346          dtl_max_txg = txg + TXG_CONCURRENT_STATES;
4030 4347  
4031 4348          vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
4032 4349              dtl_max_txg - TXG_INITIAL);
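/*
 * [Editorial note] vdev_dtl_dirty() takes a (start, size) pair, so the call
 * above marks the half-open txg range [TXG_INITIAL, dtl_max_txg) as missing,
 * matching the interval in the comment that precedes it.
 */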
4033 4350  
4034 4351          if (newvd->vdev_isspare) {
4035 4352                  spa_spare_activate(newvd);
4036 4353                  spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
4037 4354          }
4038 4355  
4039 4356          oldvdpath = spa_strdup(oldvd->vdev_path);
4040 4357          newvdpath = spa_strdup(newvd->vdev_path);
4041 4358          newvd_isspare = newvd->vdev_isspare;
4042 4359  
4043 4360          /*
4044 4361           * Mark newvd's DTL dirty in this txg.
4045 4362           */
4046 4363          vdev_dirty(tvd, VDD_DTL, newvd, txg);
4047 4364  
4048 4365          /*
4049 4366           * Restart the resilver
4050 4367           */
4051 4368          dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
4052 4369  
4053 4370          /*
4054 4371           * Commit the config
4055 4372           */
4056 4373          (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
4057 4374  
4058 4375          spa_history_log_internal(LOG_POOL_VDEV_ATTACH, spa, NULL,
4059 4376              "%s vdev=%s %s vdev=%s",
4060 4377              replacing && newvd_isspare ? "spare in" :
4061 4378              replacing ? "replace" : "attach", newvdpath,
4062 4379              replacing ? "for" : "to", oldvdpath);
4063 4380  
4064 4381          spa_strfree(oldvdpath);
4065 4382          spa_strfree(newvdpath);
4066 4383  
4067 4384          if (spa->spa_bootfs)
4068 4385                  spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
4069 4386  
4070 4387          return (0);
4071 4388  }
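/*
 * [Editorial sketch, not part of the webrev] A hypothetical caller replacing
 * a failed leaf vdev: nvroot describes the single new child, and
 * replacing=B_TRUE selects the 'replacing' vdev semantics described in the
 * block comment above spa_vdev_attach().
 */
static int
replace_leaf(spa_t *spa, uint64_t failed_guid, nvlist_t *nvroot)
{
        return (spa_vdev_attach(spa, failed_guid, nvroot, B_TRUE));
}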
4072 4389  
4073 4390  /*
4074 4391   * Detach a device from a mirror or replacing vdev.
4075 4392   * If 'replace_done' is specified, only detach if the parent
4076 4393   * is a replacing vdev.
4077 4394   */
4078 4395  int
4079 4396  spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
4080 4397  {
4081 4398          uint64_t txg;
4082 4399          int error;
4083 4400          vdev_t *rvd = spa->spa_root_vdev;
4084 4401          vdev_t *vd, *pvd, *cvd, *tvd;
4085 4402          boolean_t unspare = B_FALSE;
4086 4403          uint64_t unspare_guid;
4087 4404          char *vdpath;
4088 4405  
4089 4406          ASSERT(spa_writeable(spa));
4090 4407  
4091 4408          txg = spa_vdev_enter(spa);
4092 4409  
4093 4410          vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4094 4411  
4095 4412          if (vd == NULL)
4096 4413                  return (spa_vdev_exit(spa, NULL, txg, ENODEV));
4097 4414  
4098 4415          if (!vd->vdev_ops->vdev_op_leaf)
4099 4416                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4100 4417  
4101 4418          pvd = vd->vdev_parent;
4102 4419  
4103 4420          /*
4104 4421           * If the parent/child relationship is not as expected, don't do it.
4105 4422           * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
4106 4423           * vdev that's replacing B with C.  The user's intent in replacing
4107 4424           * is to go from M(A,B) to M(A,C).  If the user decides to cancel
4108 4425           * the replace by detaching C, the expected behavior is to end up
4109 4426           * M(A,B).  But suppose that right after deciding to detach C,
4110 4427           * the replacement of B completes.  We would have M(A,C), and then
4111 4428           * ask to detach C, which would leave us with just A -- not what
4112 4429           * the user wanted.  To prevent this, we make sure that the
4113 4430           * parent/child relationship hasn't changed -- in this example,
4114 4431           * that C's parent is still the replacing vdev R.
4115 4432           */
4116 4433          if (pvd->vdev_guid != pguid && pguid != 0)
4117 4434                  return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4118 4435  
4119 4436          /*
4120 4437           * Only 'replacing' or 'spare' vdevs can be replaced.
4121 4438           */
4122 4439          if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
4123 4440              pvd->vdev_ops != &vdev_spare_ops)
4124 4441                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4125 4442  
4126 4443          ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
4127 4444              spa_version(spa) >= SPA_VERSION_SPARES);
4128 4445  
4129 4446          /*
4130 4447           * Only mirror, replacing, and spare vdevs support detach.
4131 4448           */
4132 4449          if (pvd->vdev_ops != &vdev_replacing_ops &&
4133 4450              pvd->vdev_ops != &vdev_mirror_ops &&
4134 4451              pvd->vdev_ops != &vdev_spare_ops)
4135 4452                  return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
4136 4453  
4137 4454          /*
4138 4455           * If this device has the only valid copy of some data,
4139 4456           * we cannot safely detach it.
4140 4457           */
4141 4458          if (vdev_dtl_required(vd))
4142 4459                  return (spa_vdev_exit(spa, NULL, txg, EBUSY));
4143 4460  
4144 4461          ASSERT(pvd->vdev_children >= 2);
4145 4462  
4146 4463          /*
4147 4464           * If we are detaching the second disk from a replacing vdev, then
4148 4465           * check to see if we changed the original vdev's path to have "/old"
4149 4466           * at the end in spa_vdev_attach().  If so, undo that change now.
4150 4467           */
4151 4468          if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
4152 4469              vd->vdev_path != NULL) {
4153 4470                  size_t len = strlen(vd->vdev_path);
4154 4471  
4155 4472                  for (int c = 0; c < pvd->vdev_children; c++) {
4156 4473                          cvd = pvd->vdev_child[c];
4157 4474  
4158 4475                          if (cvd == vd || cvd->vdev_path == NULL)
4159 4476                                  continue;
4160 4477  
4161 4478                          if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
4162 4479                              strcmp(cvd->vdev_path + len, "/old") == 0) {
4163 4480                                  spa_strfree(cvd->vdev_path);
4164 4481                                  cvd->vdev_path = spa_strdup(vd->vdev_path);
4165 4482                                  break;
4166 4483                          }
4167 4484                  }
4168 4485          }
4169 4486  
4170 4487          /*
4171 4488           * If we are detaching the original disk from a spare, then it implies
4172 4489           * that the spare should become a real disk, and be removed from the
4173 4490           * active spare list for the pool.
4174 4491           */
4175 4492          if (pvd->vdev_ops == &vdev_spare_ops &&
4176 4493              vd->vdev_id == 0 &&
4177 4494              pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
4178 4495                  unspare = B_TRUE;
4179 4496  
4180 4497          /*
4181 4498           * Erase the disk labels so the disk can be used for other things.
4182 4499           * This must be done after all other error cases are handled,
4183 4500           * but before we disembowel vd (so we can still do I/O to it).
4184 4501           * But if we can't do it, don't treat the error as fatal --
4185 4502           * it may be that the unwritability of the disk is the reason
4186 4503           * it's being detached!
4187 4504           */
4188 4505          error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4189 4506  
4190 4507          /*
4191 4508           * Remove vd from its parent and compact the parent's children.
4192 4509           */
4193 4510          vdev_remove_child(pvd, vd);
4194 4511          vdev_compact_children(pvd);
4195 4512  
4196 4513          /*
4197 4514           * Remember one of the remaining children so we can get tvd below.
4198 4515           */
4199 4516          cvd = pvd->vdev_child[pvd->vdev_children - 1];
4200 4517  
4201 4518          /*
4202 4519           * If we need to remove the remaining child from the list of hot spares,
4203 4520           * do it now, marking the vdev as no longer a spare in the process.
4204 4521           * We must do this before vdev_remove_parent(), because that can
4205 4522           * change the GUID if it creates a new toplevel GUID.  For a similar
4206 4523           * reason, we must remove the spare now, in the same txg as the detach;
4207 4524           * otherwise someone could attach a new sibling, change the GUID, and
4208 4525           * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
4209 4526           */
4210 4527          if (unspare) {
4211 4528                  ASSERT(cvd->vdev_isspare);
4212 4529                  spa_spare_remove(cvd);
4213 4530                  unspare_guid = cvd->vdev_guid;
4214 4531                  (void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
4215 4532                  cvd->vdev_unspare = B_TRUE;
4216 4533          }
4217 4534  
4218 4535          /*
4219 4536           * If the parent mirror/replacing vdev only has one child,
4220 4537           * the parent is no longer needed.  Remove it from the tree.
4221 4538           */
4222 4539          if (pvd->vdev_children == 1) {
4223 4540                  if (pvd->vdev_ops == &vdev_spare_ops)
4224 4541                          cvd->vdev_unspare = B_FALSE;
4225 4542                  vdev_remove_parent(cvd);
4226 4543                  cvd->vdev_resilvering = B_FALSE;
4227 4544          }
4228 4545  
4229 4546  
4230 4547          /*
4231 4548           * We don't set tvd until now because the parent we just removed
4232 4549           * may have been the previous top-level vdev.
4233 4550           */
4234 4551          tvd = cvd->vdev_top;
4235 4552          ASSERT(tvd->vdev_parent == rvd);
4236 4553  
4237 4554          /*
4238 4555           * Reevaluate the parent vdev state.
4239 4556           */
4240 4557          vdev_propagate_state(cvd);
4241 4558  
4242 4559          /*
4243 4560           * If the 'autoexpand' property is set on the pool then automatically
4244 4561           * try to expand the size of the pool. For example if the device we
4245 4562           * just detached was smaller than the others, it may be possible to
4246 4563           * add metaslabs (i.e. grow the pool). We need to reopen the vdev
4247 4564           * first so that we can obtain the updated sizes of the leaf vdevs.
4248 4565           */
4249 4566          if (spa->spa_autoexpand) {
4250 4567                  vdev_reopen(tvd);
4251 4568                  vdev_expand(tvd, txg);
4252 4569          }
4253 4570  
4254 4571          vdev_config_dirty(tvd);
4255 4572  
4256 4573          /*
4257 4574           * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
4258 4575           * vd->vdev_detached is set and free vd's DTL object in syncing context.
4259 4576           * But first make sure we're not on any *other* txg's DTL list, to
4260 4577           * prevent vd from being accessed after it's freed.
4261 4578           */
4262 4579          vdpath = spa_strdup(vd->vdev_path);
4263 4580          for (int t = 0; t < TXG_SIZE; t++)
4264 4581                  (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
4265 4582          vd->vdev_detached = B_TRUE;
4266 4583          vdev_dirty(tvd, VDD_DTL, vd, txg);
4267 4584  
4268 4585          spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
4269 4586  
4270 4587          /* hang on to the spa before we release the lock */
4271 4588          spa_open_ref(spa, FTAG);
4272 4589  
4273 4590          error = spa_vdev_exit(spa, vd, txg, 0);
4274 4591  
4275 4592          spa_history_log_internal(LOG_POOL_VDEV_DETACH, spa, NULL,
4276 4593              "vdev=%s", vdpath);
4277 4594          spa_strfree(vdpath);
4278 4595  
4279 4596          /*
4280 4597           * If this was the removal of the original device in a hot spare vdev,
4281 4598           * then we want to go through and remove the device from the hot spare
4282 4599           * list of every other pool.
4283 4600           */
4284 4601          if (unspare) {
4285 4602                  spa_t *altspa = NULL;
4286 4603  
4287 4604                  mutex_enter(&spa_namespace_lock);
4288 4605                  while ((altspa = spa_next(altspa)) != NULL) {
4289 4606                          if (altspa->spa_state != POOL_STATE_ACTIVE ||
4290 4607                              altspa == spa)
4291 4608                                  continue;
4292 4609  
4293 4610                          spa_open_ref(altspa, FTAG);
4294 4611                          mutex_exit(&spa_namespace_lock);
4295 4612                          (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
4296 4613                          mutex_enter(&spa_namespace_lock);
4297 4614                          spa_close(altspa, FTAG);
4298 4615                  }
4299 4616                  mutex_exit(&spa_namespace_lock);
4300 4617  
4301 4618                  /* search the rest of the vdevs for spares to remove */
4302 4619                  spa_vdev_resilver_done(spa);
4303 4620          }
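/*
 * [Editorial note] The loop above repeats the ref/drop-lock pattern: each
 * candidate pool is held with spa_open_ref() and the namespace lock is
 * released before spa_vdev_remove() is called on it, since that path
 * re-acquires the namespace lock via spa_vdev_enter().
 */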
4304 4621  
4305 4622          /* all done with the spa; OK to release */
4306 4623          mutex_enter(&spa_namespace_lock);
4307 4624          spa_close(spa, FTAG);
4308 4625          mutex_exit(&spa_namespace_lock);
4309 4626  
4310 4627          return (error);
4311 4628  }
4312 4629  
4313 4630  /*
4314 4631   * Split a set of devices from their mirrors, and create a new pool from them.
4315 4632   */
4316 4633  int
4317 4634  spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
4318 4635      nvlist_t *props, boolean_t exp)
4319 4636  {
4320 4637          int error = 0;
4321 4638          uint64_t txg, *glist;
4322 4639          spa_t *newspa;
4323 4640          uint_t c, children, lastlog;
4324 4641          nvlist_t **child, *nvl, *tmp;
4325 4642          dmu_tx_t *tx;
4326 4643          char *altroot = NULL;
4327 4644          vdev_t *rvd, **vml = NULL;                      /* vdev modify list */
4328 4645          boolean_t activate_slog;
4329 4646  
4330 4647          ASSERT(spa_writeable(spa));
4331 4648  
4332 4649          txg = spa_vdev_enter(spa);
4333 4650  
4334 4651          /* clear the log and flush everything up to now */
4335 4652          activate_slog = spa_passivate_log(spa);
4336 4653          (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4337 4654          error = spa_offline_log(spa);
4338 4655          txg = spa_vdev_config_enter(spa);
4339 4656  
4340 4657          if (activate_slog)
4341 4658                  spa_activate_log(spa);
4342 4659  
4343 4660          if (error != 0)
4344 4661                  return (spa_vdev_exit(spa, NULL, txg, error));
4345 4662  
4346 4663          /* check new spa name before going any further */
4347 4664          if (spa_lookup(newname) != NULL)
4348 4665                  return (spa_vdev_exit(spa, NULL, txg, EEXIST));
4349 4666  
4350 4667          /*
4351 4668           * scan through all the children to ensure they're all mirrors
4352 4669           */
4353 4670          if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
4354 4671              nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
4355 4672              &children) != 0)
4356 4673                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4357 4674  
4358 4675          /* first, check to ensure we've got the right child count */
4359 4676          rvd = spa->spa_root_vdev;
4360 4677          lastlog = 0;
4361 4678          for (c = 0; c < rvd->vdev_children; c++) {
4362 4679                  vdev_t *vd = rvd->vdev_child[c];
4363 4680  
4364 4681                  /* don't count the holes & logs as children */
4365 4682                  if (vd->vdev_islog || vd->vdev_ishole) {
4366 4683                          if (lastlog == 0)
4367 4684                                  lastlog = c;
4368 4685                          continue;
4369 4686                  }
4370 4687  
4371 4688                  lastlog = 0;
4372 4689          }
4373 4690          if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
4374 4691                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4375 4692  
4376 4693          /* next, ensure no spare or cache devices are part of the split */
4377 4694          if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
4378 4695              nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
4379 4696                  return (spa_vdev_exit(spa, NULL, txg, EINVAL));
4380 4697  
4381 4698          vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
4382 4699          glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
4383 4700  
4384 4701          /* then, loop over each vdev and validate it */
4385 4702          for (c = 0; c < children; c++) {
4386 4703                  uint64_t is_hole = 0;
4387 4704  
4388 4705                  (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
4389 4706                      &is_hole);
4390 4707  
4391 4708                  if (is_hole != 0) {
4392 4709                          if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
4393 4710                              spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
4394 4711                                  continue;
4395 4712                          } else {
4396 4713                                  error = EINVAL;
4397 4714                                  break;
4398 4715                          }
4399 4716                  }
4400 4717  
4401 4718                  /* which disk is going to be split? */
4402 4719                  if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
4403 4720                      &glist[c]) != 0) {
4404 4721                          error = EINVAL;
4405 4722                          break;
4406 4723                  }
4407 4724  
4408 4725                  /* look it up in the spa */
4409 4726                  vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
4410 4727                  if (vml[c] == NULL) {
4411 4728                          error = ENODEV;
4412 4729                          break;
4413 4730                  }
4414 4731  
4415 4732                  /* make sure there's nothing stopping the split */
4416 4733                  if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
4417 4734                      vml[c]->vdev_islog ||
4418 4735                      vml[c]->vdev_ishole ||
4419 4736                      vml[c]->vdev_isspare ||
4420 4737                      vml[c]->vdev_isl2cache ||
4421 4738                      !vdev_writeable(vml[c]) ||
4422 4739                      vml[c]->vdev_children != 0 ||
4423 4740                      vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
4424 4741                      c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
4425 4742                          error = EINVAL;
4426 4743                          break;
4427 4744                  }
4428 4745  
4429 4746                  if (vdev_dtl_required(vml[c])) {
4430 4747                          error = EBUSY;
4431 4748                          break;
4432 4749                  }
4433 4750  
4434 4751                  /* we need certain info from the top level */
4435 4752                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
4436 4753                      vml[c]->vdev_top->vdev_ms_array) == 0);
4437 4754                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
4438 4755                      vml[c]->vdev_top->vdev_ms_shift) == 0);
4439 4756                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
4440 4757                      vml[c]->vdev_top->vdev_asize) == 0);
4441 4758                  VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
4442 4759                      vml[c]->vdev_top->vdev_ashift) == 0);
4443 4760          }
4444 4761  
4445 4762          if (error != 0) {
4446 4763                  kmem_free(vml, children * sizeof (vdev_t *));
4447 4764                  kmem_free(glist, children * sizeof (uint64_t));
4448 4765                  return (spa_vdev_exit(spa, NULL, txg, error));
4449 4766          }
4450 4767  
4451 4768          /* stop writers from using the disks */
4452 4769          for (c = 0; c < children; c++) {
4453 4770                  if (vml[c] != NULL)
4454 4771                          vml[c]->vdev_offline = B_TRUE;
4455 4772          }
4456 4773          vdev_reopen(spa->spa_root_vdev);
4457 4774  
4458 4775          /*
4459 4776           * Temporarily record the splitting vdevs in the spa config.  This
4460 4777           * will disappear once the config is regenerated.
4461 4778           */
4462 4779          VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
4463 4780          VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
4464 4781              glist, children) == 0);
4465 4782          kmem_free(glist, children * sizeof (uint64_t));
4466 4783  
4467 4784          mutex_enter(&spa->spa_props_lock);
4468 4785          VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
4469 4786              nvl) == 0);
4470 4787          mutex_exit(&spa->spa_props_lock);
4471 4788          spa->spa_config_splitting = nvl;
4472 4789          vdev_config_dirty(spa->spa_root_vdev);
4473 4790  
4474 4791          /* configure and create the new pool */
4475 4792          VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
4476 4793          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
4477 4794              exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
4478 4795          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
4479 4796              spa_version(spa)) == 0);
4480 4797          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
4481 4798              spa->spa_config_txg) == 0);
4482 4799          VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
4483 4800              spa_generate_guid(NULL)) == 0);
4484 4801          (void) nvlist_lookup_string(props,
4485 4802              zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
4486 4803  
4487 4804          /* add the new pool to the namespace */
4488 4805          newspa = spa_add(newname, config, altroot);
4489 4806          newspa->spa_config_txg = spa->spa_config_txg;
4490 4807          spa_set_log_state(newspa, SPA_LOG_CLEAR);
4491 4808  
4492 4809          /* release the spa config lock, retaining the namespace lock */
4493 4810          spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4494 4811  
4495 4812          if (zio_injection_enabled)
4496 4813                  zio_handle_panic_injection(spa, FTAG, 1);
4497 4814  
4498 4815          spa_activate(newspa, spa_mode_global);
4499 4816          spa_async_suspend(newspa);
4500 4817  
4501 4818          /* create the new pool from the disks of the original pool */
4502 4819          error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
4503 4820          if (error)
4504 4821                  goto out;
4505 4822  
4506 4823          /* if that worked, generate a real config for the new pool */
4507 4824          if (newspa->spa_root_vdev != NULL) {
4508 4825                  VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
4509 4826                      NV_UNIQUE_NAME, KM_SLEEP) == 0);
4510 4827                  VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
4511 4828                      ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
4512 4829                  spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
4513 4830                      B_TRUE));
4514 4831          }
4515 4832  
4516 4833          /* set the props */
4517 4834          if (props != NULL) {
4518 4835                  spa_configfile_set(newspa, props, B_FALSE);
4519 4836                  error = spa_prop_set(newspa, props);
4520 4837                  if (error)
4521 4838                          goto out;
4522 4839          }
4523 4840  
4524 4841          /* flush everything */
4525 4842          txg = spa_vdev_config_enter(newspa);
4526 4843          vdev_config_dirty(newspa->spa_root_vdev);
4527 4844          (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
4528 4845  
4529 4846          if (zio_injection_enabled)
4530 4847                  zio_handle_panic_injection(spa, FTAG, 2);
4531 4848  
4532 4849          spa_async_resume(newspa);
4533 4850  
4534 4851          /* finally, update the original pool's config */
4535 4852          txg = spa_vdev_config_enter(spa);
4536 4853          tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
4537 4854          error = dmu_tx_assign(tx, TXG_WAIT);
4538 4855          if (error != 0)
4539 4856                  dmu_tx_abort(tx);
4540 4857          for (c = 0; c < children; c++) {
4541 4858                  if (vml[c] != NULL) {
4542 4859                          vdev_split(vml[c]);
4543 4860                          if (error == 0)
4544 4861                                  spa_history_log_internal(LOG_POOL_VDEV_DETACH,
4545 4862                                      spa, tx, "vdev=%s",
4546 4863                                      vml[c]->vdev_path);
4547 4864                          vdev_free(vml[c]);
4548 4865                  }
4549 4866          }
4550 4867          vdev_config_dirty(spa->spa_root_vdev);
4551 4868          spa->spa_config_splitting = NULL;
4552 4869          nvlist_free(nvl);
4553 4870          if (error == 0)
4554 4871                  dmu_tx_commit(tx);
4555 4872          (void) spa_vdev_exit(spa, NULL, txg, 0);
4556 4873  
4557 4874          if (zio_injection_enabled)
4558 4875                  zio_handle_panic_injection(spa, FTAG, 3);
4559 4876  
4560 4877          /* split is complete; log a history record */
4561 4878          spa_history_log_internal(LOG_POOL_SPLIT, newspa, NULL,
4562 4879              "split new pool %s from pool %s", newname, spa_name(spa));
4563 4880  
4564 4881          kmem_free(vml, children * sizeof (vdev_t *));
4565 4882  
4566 4883          /* if we're not going to mount the filesystems in userland, export */
4567 4884          if (exp)
4568 4885                  error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
4569 4886                      B_FALSE, B_FALSE);
4570 4887  
4571 4888          return (error);
4572 4889  
4573 4890  out:
4574 4891          spa_unload(newspa);
4575 4892          spa_deactivate(newspa);
4576 4893          spa_remove(newspa);
4577 4894  
4578 4895          txg = spa_vdev_config_enter(spa);
4579 4896  
4580 4897          /* re-online all offlined disks */
4581 4898          for (c = 0; c < children; c++) {
4582 4899                  if (vml[c] != NULL)
4583 4900                          vml[c]->vdev_offline = B_FALSE;
4584 4901          }
4585 4902          vdev_reopen(spa->spa_root_vdev);
4586 4903  
4587 4904          nvlist_free(spa->spa_config_splitting);
4588 4905          spa->spa_config_splitting = NULL;
4589 4906          (void) spa_vdev_exit(spa, NULL, txg, error);
4590 4907  
4591 4908          kmem_free(vml, children * sizeof (vdev_t *));
4592 4909          return (error);
4593 4910  }
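
/*
 * A minimal user-space libnvpair sketch of the bookkeeping in the split
 * path above: the departing vdev guids are recorded as a uint64 array in
 * an nvlist attached to the pool config, and read back on the import
 * side.  The "split_list" key is illustrative (the kernel code uses
 * ZPOOL_CONFIG_SPLIT_LIST) and the userland allocation flags differ from
 * the kernel's KM_SLEEP.
 */
#include <assert.h>
#include <libnvpair.h>

int
main(void)
{
        nvlist_t *nvl;
        uint64_t guids[2] = { 111, 222 };
        uint64_t *out;
        uint_t n;

        /* record the guid list ... */
        assert(nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) == 0);
        assert(nvlist_add_uint64_array(nvl, "split_list", guids, 2) == 0);

        /* ... and read it back, as the assembling import would */
        assert(nvlist_lookup_uint64_array(nvl, "split_list", &out, &n) == 0);
        assert(n == 2 && out[0] == 111 && out[1] == 222);

        nvlist_free(nvl);
        return (0);
}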
4594 4911  
4595 4912  static nvlist_t *
4596 4913  spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
4597 4914  {
4598 4915          for (int i = 0; i < count; i++) {
4599 4916                  uint64_t guid;
4600 4917  
4601 4918                  VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID,
4602 4919                      &guid) == 0);
4603 4920  
4604 4921                  if (guid == target_guid)
4605 4922                          return (nvpp[i]);
4606 4923          }
4607 4924  
4608 4925          return (NULL);
4609 4926  }
4610 4927  
4611 4928  static void
4612 4929  spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
4613 4930          nvlist_t *dev_to_remove)
4614 4931  {
4615 4932          nvlist_t **newdev = NULL;
4616 4933  
4617 4934          if (count > 1)
4618 4935                  newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
4619 4936  
4620 4937          for (int i = 0, j = 0; i < count; i++) {
4621 4938                  if (dev[i] == dev_to_remove)
4622 4939                          continue;
4623 4940                  VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0);
4624 4941          }
4625 4942  
4626 4943          VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
4627 4944          VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0);
4628 4945  
4629 4946          for (int i = 0; i < count - 1; i++)
4630 4947                  nvlist_free(newdev[i]);
4631 4948  
4632 4949          if (count > 1)
4633 4950                  kmem_free(newdev, (count - 1) * sizeof (void *));
4634 4951  }
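
/*
 * A minimal sketch of the copy-all-but-one pattern spa_vdev_remove_aux()
 * uses above: survivors are copied into a new (count - 1)-element array
 * and the old array is replaced wholesale.  Plain ints stand in for the
 * duplicated nvlists; illustrative only.
 */
#include <assert.h>
#include <stdlib.h>

static int *
remove_one(const int *dev, int count, int dev_to_remove)
{
        int *newdev = malloc((count - 1) * sizeof (int));

        for (int i = 0, j = 0; i < count; i++) {
                if (dev[i] == dev_to_remove)
                        continue;       /* skip the entry being removed */
                newdev[j++] = dev[i];
        }
        return (newdev);
}

int
main(void)
{
        int devs[] = { 10, 20, 30 };
        int *left = remove_one(devs, 3, 20);

        assert(left[0] == 10 && left[1] == 30);
        free(left);
        return (0);
}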
4635 4952  
4636 4953  /*
4637 4954   * Evacuate the device.
4638 4955   */
4639 4956  static int
4640 4957  spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
4641 4958  {
4642 4959          uint64_t txg;
4643 4960          int error = 0;
4644 4961  
4645 4962          ASSERT(MUTEX_HELD(&spa_namespace_lock));
4646 4963          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
4647 4964          ASSERT(vd == vd->vdev_top);
4648 4965  
4649 4966          /*
4650 4967           * Evacuate the device.  We don't hold the config lock as writer
4651 4968           * since we need to do I/O but we do keep the
4652 4969           * spa_namespace_lock held.  Once this completes the device
4653 4970           * should no longer have any blocks allocated on it.
4654 4971           */
4655 4972          if (vd->vdev_islog) {
4656 4973                  if (vd->vdev_stat.vs_alloc != 0)
4657 4974                          error = spa_offline_log(spa);
4658 4975          } else {
4659 4976                  error = ENOTSUP;
4660 4977          }
4661 4978  
4662 4979          if (error)
4663 4980                  return (error);
4664 4981  
4665 4982          /*
4666 4983           * The evacuation succeeded.  Remove any remaining MOS metadata
4667 4984           * associated with this vdev, and wait for these changes to sync.
4668 4985           */
4669 4986          ASSERT3U(vd->vdev_stat.vs_alloc, ==, 0);
4670 4987          txg = spa_vdev_config_enter(spa);
4671 4988          vd->vdev_removing = B_TRUE;
4672 4989          vdev_dirty(vd, 0, NULL, txg);
4673 4990          vdev_config_dirty(vd);
4674 4991          spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
4675 4992  
4676 4993          return (0);
4677 4994  }
4678 4995  
4679 4996  /*
4680 4997   * Complete the removal by cleaning up the namespace.
4681 4998   */
4682 4999  static void
4683 5000  spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
4684 5001  {
4685 5002          vdev_t *rvd = spa->spa_root_vdev;
4686 5003          uint64_t id = vd->vdev_id;
4687 5004          boolean_t last_vdev = (id == (rvd->vdev_children - 1));
4688 5005  
4689 5006          ASSERT(MUTEX_HELD(&spa_namespace_lock));
4690 5007          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
4691 5008          ASSERT(vd == vd->vdev_top);
4692 5009  
4693 5010          /*
4694 5011           * Only remove any devices which are empty.
4695 5012           */
4696 5013          if (vd->vdev_stat.vs_alloc != 0)
4697 5014                  return;
4698 5015  
4699 5016          (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
4700 5017  
4701 5018          if (list_link_active(&vd->vdev_state_dirty_node))
4702 5019                  vdev_state_clean(vd);
4703 5020          if (list_link_active(&vd->vdev_config_dirty_node))
4704 5021                  vdev_config_clean(vd);
4705 5022  
4706 5023          vdev_free(vd);
4707 5024  
4708 5025          if (last_vdev) {
4709 5026                  vdev_compact_children(rvd);
4710 5027          } else {
4711 5028                  vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops);
4712 5029                  vdev_add_child(rvd, vd);
4713 5030          }
4714 5031          vdev_config_dirty(rvd);
4715 5032  
4716 5033          /*
4717 5034           * Reassess the health of our root vdev.
4718 5035           */
4719 5036          vdev_reopen(rvd);
4720 5037  }
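
/*
 * A sketch of the id-stability rule in spa_vdev_remove_from_namespace():
 * removing the last top-level child compacts the array, while removing a
 * middle child leaves a hole placeholder so the remaining vdev ids keep
 * their values.  Here -1 stands in for a hole vdev; illustrative only.
 */
#include <assert.h>

static void
remove_child(int *children, int *count, int id)
{
        if (id == *count - 1)
                (*count)--;             /* last child: compact */
        else
                children[id] = -1;      /* middle child: leave a hole */
}

int
main(void)
{
        int kids[3] = { 100, 101, 102 };
        int n = 3;

        remove_child(kids, &n, 1);
        assert(n == 3 && kids[2] == 102);       /* id 2 is undisturbed */
        remove_child(kids, &n, 2);
        assert(n == 2);
        return (0);
}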
4721 5038  
4722 5039  /*
4723 5040   * Remove a device from the pool -
4724 5041   *
4725 5042   * Removing a device from the vdev namespace requires several steps
4726 5043   * and can take a significant amount of time.  As a result we use
4727 5044   * the spa_vdev_config_[enter/exit] functions which allow us to
4728 5045   * grab and release the spa_config_lock while still holding the namespace
4729 5046   * lock.  During each step the configuration is synced out.
4730 5047   */
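
/*
 * A rough pthread analogue of the locking pattern described above, with
 * plain mutexes standing in for the namespace and config locks: the
 * outer lock is held across the whole removal while the inner lock is
 * taken and dropped around each step.  Illustrative only; the real code
 * also syncs the configuration out between steps.
 */
#include <pthread.h>

static pthread_mutex_t namespace_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t config_lock = PTHREAD_MUTEX_INITIALIZER;

static void
remove_device(void (**steps)(void), int nsteps)
{
        pthread_mutex_lock(&namespace_lock);
        for (int i = 0; i < nsteps; i++) {
                pthread_mutex_lock(&config_lock);
                steps[i]();             /* one config-modifying step */
                pthread_mutex_unlock(&config_lock);
                /* config would be synced to disk here, locks dropped */
        }
        pthread_mutex_unlock(&namespace_lock);
}

static void
step(void)
{
}

int
main(void)
{
        void (*steps[])(void) = { step, step };

        remove_device(steps, 2);
        return (0);
}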
4731 5048  
4732 5049  /*
4733 5050   * Remove a device from the pool.  Currently, this supports removing only hot
4734 5051   * spares, slogs, and level 2 ARC devices.
4735 5052   */
4736 5053  int
4737 5054  spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
4738 5055  {
4739 5056          vdev_t *vd;
4740 5057          metaslab_group_t *mg;
4741 5058          nvlist_t **spares, **l2cache, *nv;
4742 5059          uint64_t txg = 0;
4743 5060          uint_t nspares, nl2cache;
4744 5061          int error = 0;
4745 5062          boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
4746 5063  
4747 5064          ASSERT(spa_writeable(spa));
4748 5065  
4749 5066          if (!locked)
4750 5067                  txg = spa_vdev_enter(spa);
4751 5068  
4752 5069          vd = spa_lookup_by_guid(spa, guid, B_FALSE);
4753 5070  
4754 5071          if (spa->spa_spares.sav_vdevs != NULL &&
4755 5072              nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
4756 5073              ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
4757 5074              (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
4758 5075                  /*
4759 5076                   * Only remove the hot spare if it's not currently in use
4760 5077                   * in this pool.
4761 5078                   */
4762 5079                  if (vd == NULL || unspare) {
4763 5080                          spa_vdev_remove_aux(spa->spa_spares.sav_config,
4764 5081                              ZPOOL_CONFIG_SPARES, spares, nspares, nv);
4765 5082                          spa_load_spares(spa);
4766 5083                          spa->spa_spares.sav_sync = B_TRUE;
4767 5084                  } else {
4768 5085                          error = EBUSY;
4769 5086                  }
4770 5087          } else if (spa->spa_l2cache.sav_vdevs != NULL &&
4771 5088              nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
4772 5089              ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
4773 5090              (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
4774 5091                  /*
4775 5092                   * Cache devices can always be removed.
4776 5093                   */
4777 5094                  spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
4778 5095                      ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
4779 5096                  spa_load_l2cache(spa);
4780 5097                  spa->spa_l2cache.sav_sync = B_TRUE;
4781 5098          } else if (vd != NULL && vd->vdev_islog) {
4782 5099                  ASSERT(!locked);
4783 5100                  ASSERT(vd == vd->vdev_top);
4784 5101  
4785 5102                  /*
4786 5103                   * XXX - Once we have bp-rewrite this should
4787 5104                   * become the common case.
4788 5105                   */
4789 5106  
4790 5107                  mg = vd->vdev_mg;
4791 5108  
4792 5109                  /*
4793 5110                   * Stop allocating from this vdev.
4794 5111                   */
4795 5112                  metaslab_group_passivate(mg);
4796 5113  
4797 5114                  /*
4798 5115                   * Wait for the youngest allocations and frees to sync,
4799 5116                   * and then wait for the deferral of those frees to finish.
4800 5117                   */
4801 5118                  spa_vdev_config_exit(spa, NULL,
4802 5119                      txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
4803 5120  
4804 5121                  /*
4805 5122                   * Attempt to evacuate the vdev.
4806 5123                   */
4807 5124                  error = spa_vdev_remove_evacuate(spa, vd);
4808 5125  
4809 5126                  txg = spa_vdev_config_enter(spa);
4810 5127  
4811 5128                  /*
4812 5129                   * If we couldn't evacuate the vdev, unwind.
4813 5130                   */
4814 5131                  if (error) {
4815 5132                          metaslab_group_activate(mg);
4816 5133                          return (spa_vdev_exit(spa, NULL, txg, error));
4817 5134                  }
4818 5135  
4819 5136                  /*
4820 5137                   * Clean up the vdev namespace.
4821 5138                   */
4822 5139                  spa_vdev_remove_from_namespace(spa, vd);
4823 5140  
4824 5141          } else if (vd != NULL) {
4825 5142                  /*
4826 5143                   * Normal vdevs cannot be removed (yet).
4827 5144                   */
4828 5145                  error = ENOTSUP;
4829 5146          } else {
4830 5147                  /*
4831 5148                   * There is no vdev of any kind with the specified guid.
4832 5149                   */
4833 5150                  error = ENOENT;
4834 5151          }
4835 5152  
4836 5153          if (!locked)
4837 5154                  return (spa_vdev_exit(spa, NULL, txg, error));
4838 5155  
4839 5156          return (error);
4840 5157  }
4841 5158  
4842 5159  /*
4843 5160   * Find any device that's done replacing, or a vdev marked 'unspare' that's
4844 5161   * currently spared, so we can detach it.
4845 5162   */
4846 5163  static vdev_t *
4847 5164  spa_vdev_resilver_done_hunt(vdev_t *vd)
4848 5165  {
4849 5166          vdev_t *newvd, *oldvd;
4850 5167  
4851 5168          for (int c = 0; c < vd->vdev_children; c++) {
4852 5169                  oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
4853 5170                  if (oldvd != NULL)
4854 5171                          return (oldvd);
4855 5172          }
4856 5173  
4857 5174          /*
4858 5175           * Check for a completed replacement.  We always consider the first
4859 5176           * vdev in the list to be the oldest vdev, and the last one to be
4860 5177           * the newest (see spa_vdev_attach() for how that works).  In
4861 5178           * the case where the newest vdev is faulted, we will not automatically
4862 5179           * remove it after a resilver completes.  This is OK as it will require
4863 5180           * user intervention to determine which disk the admin wishes to keep.
4864 5181           */
4865 5182          if (vd->vdev_ops == &vdev_replacing_ops) {
4866 5183                  ASSERT(vd->vdev_children > 1);
4867 5184  
4868 5185                  newvd = vd->vdev_child[vd->vdev_children - 1];
4869 5186                  oldvd = vd->vdev_child[0];
4870 5187  
4871 5188                  if (vdev_dtl_empty(newvd, DTL_MISSING) &&
4872 5189                      vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4873 5190                      !vdev_dtl_required(oldvd))
4874 5191                          return (oldvd);
4875 5192          }
4876 5193  
4877 5194          /*
4878 5195           * Check for a completed resilver with the 'unspare' flag set.
4879 5196           */
4880 5197          if (vd->vdev_ops == &vdev_spare_ops) {
4881 5198                  vdev_t *first = vd->vdev_child[0];
4882 5199                  vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
4883 5200  
4884 5201                  if (last->vdev_unspare) {
4885 5202                          oldvd = first;
4886 5203                          newvd = last;
4887 5204                  } else if (first->vdev_unspare) {
4888 5205                          oldvd = last;
4889 5206                          newvd = first;
4890 5207                  } else {
4891 5208                          oldvd = NULL;
4892 5209                  }
4893 5210  
4894 5211                  if (oldvd != NULL &&
4895 5212                      vdev_dtl_empty(newvd, DTL_MISSING) &&
4896 5213                      vdev_dtl_empty(newvd, DTL_OUTAGE) &&
4897 5214                      !vdev_dtl_required(oldvd))
4898 5215                          return (oldvd);
4899 5216  
4900 5217                  /*
4901 5218                   * If there are more than two spares attached to a disk,
4902 5219                   * and those spares are not required, then we want to
4903 5220                   * attempt to free them up now so that they can be used
4904 5221                   * by other pools.  Once we're back down to a single
4905 5222                   * disk+spare, we stop removing them.
4906 5223                   */
4907 5224                  if (vd->vdev_children > 2) {
4908 5225                          newvd = vd->vdev_child[1];
4909 5226  
4910 5227                          if (newvd->vdev_isspare && last->vdev_isspare &&
4911 5228                              vdev_dtl_empty(last, DTL_MISSING) &&
4912 5229                              vdev_dtl_empty(last, DTL_OUTAGE) &&
4913 5230                              !vdev_dtl_required(newvd))
4914 5231                                  return (newvd);
4915 5232                  }
4916 5233          }
4917 5234  
4918 5235          return (NULL);
4919 5236  }
4920 5237  
4921 5238  static void
4922 5239  spa_vdev_resilver_done(spa_t *spa)
4923 5240  {
4924 5241          vdev_t *vd, *pvd, *ppvd;
4925 5242          uint64_t guid, sguid, pguid, ppguid;
4926 5243  
4927 5244          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4928 5245  
4929 5246          while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
4930 5247                  pvd = vd->vdev_parent;
4931 5248                  ppvd = pvd->vdev_parent;
4932 5249                  guid = vd->vdev_guid;
4933 5250                  pguid = pvd->vdev_guid;
4934 5251                  ppguid = ppvd->vdev_guid;
4935 5252                  sguid = 0;
4936 5253                  /*
4937 5254                   * If we have just finished replacing a hot spared device, then
4938 5255                   * we need to detach the parent's first child (the original hot
4939 5256                   * spare) as well.
4940 5257                   */
4941 5258                  if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
4942 5259                      ppvd->vdev_children == 2) {
4943 5260                          ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
4944 5261                          sguid = ppvd->vdev_child[1]->vdev_guid;
4945 5262                  }
4946 5263                  spa_config_exit(spa, SCL_ALL, FTAG);
4947 5264                  if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
4948 5265                          return;
4949 5266                  if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
4950 5267                          return;
4951 5268                  spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
4952 5269          }
4953 5270  
4954 5271          spa_config_exit(spa, SCL_ALL, FTAG);
4955 5272  }
4956 5273  
4957 5274  /*
4958 5275   * Update the stored path or FRU for this vdev.
4959 5276   */
4960 5277  int
4961 5278  spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
4962 5279      boolean_t ispath)
4963 5280  {
4964 5281          vdev_t *vd;
4965 5282          boolean_t sync = B_FALSE;
4966 5283  
4967 5284          ASSERT(spa_writeable(spa));
4968 5285  
4969 5286          spa_vdev_state_enter(spa, SCL_ALL);
4970 5287  
4971 5288          if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
4972 5289                  return (spa_vdev_state_exit(spa, NULL, ENOENT));
4973 5290  
4974 5291          if (!vd->vdev_ops->vdev_op_leaf)
4975 5292                  return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
4976 5293  
4977 5294          if (ispath) {
4978 5295                  if (strcmp(value, vd->vdev_path) != 0) {
4979 5296                          spa_strfree(vd->vdev_path);
4980 5297                          vd->vdev_path = spa_strdup(value);
4981 5298                          sync = B_TRUE;
4982 5299                  }
4983 5300          } else {
4984 5301                  if (vd->vdev_fru == NULL) {
4985 5302                          vd->vdev_fru = spa_strdup(value);
4986 5303                          sync = B_TRUE;
4987 5304                  } else if (strcmp(value, vd->vdev_fru) != 0) {
4988 5305                          spa_strfree(vd->vdev_fru);
4989 5306                          vd->vdev_fru = spa_strdup(value);
4990 5307                          sync = B_TRUE;
4991 5308                  }
4992 5309          }
4993 5310  
4994 5311          return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
4995 5312  }
4996 5313  
4997 5314  int
4998 5315  spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
4999 5316  {
5000 5317          return (spa_vdev_set_common(spa, guid, newpath, B_TRUE));
5001 5318  }
5002 5319  
5003 5320  int
5004 5321  spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
5005 5322  {
5006 5323          return (spa_vdev_set_common(spa, guid, newfru, B_FALSE));
5007 5324  }
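
/*
 * A standalone sketch of the compare-and-replace idiom in
 * spa_vdev_set_common() above: the stored string is replaced only when
 * the new value differs, and only then does the caller need to sync.
 * strdup()/free() stand in for spa_strdup()/spa_strfree().
 */
#include <assert.h>
#include <stdlib.h>
#include <string.h>

static int
set_string(char **slot, const char *value)
{
        if (*slot != NULL && strcmp(value, *slot) == 0)
                return (0);             /* unchanged: no sync needed */
        free(*slot);
        *slot = strdup(value);
        return (1);                     /* changed: caller should sync */
}

int
main(void)
{
        char *path = NULL;

        assert(set_string(&path, "/dev/dsk/c0t0d0") == 1);
        assert(set_string(&path, "/dev/dsk/c0t0d0") == 0);
        free(path);
        return (0);
}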
5008 5325  
5009 5326  /*
5010 5327   * ==========================================================================
5011 5328   * SPA Scanning
5012 5329   * ==========================================================================
5013 5330   */
5014 5331  
5015 5332  int
5016 5333  spa_scan_stop(spa_t *spa)
5017 5334  {
5018 5335          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5019 5336          if (dsl_scan_resilvering(spa->spa_dsl_pool))
5020 5337                  return (EBUSY);
5021 5338          return (dsl_scan_cancel(spa->spa_dsl_pool));
5022 5339  }
5023 5340  
5024 5341  int
5025 5342  spa_scan(spa_t *spa, pool_scan_func_t func)
5026 5343  {
5027 5344          ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
5028 5345  
5029 5346          if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE)
5030 5347                  return (ENOTSUP);
5031 5348  
5032 5349          /*
5033 5350           * If a resilver was requested, but there is no DTL on a
5034 5351           * writeable leaf device, we have nothing to do.
5035 5352           */
5036 5353          if (func == POOL_SCAN_RESILVER &&
5037 5354              !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
5038 5355                  spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
5039 5356                  return (0);
5040 5357          }
5041 5358  
5042 5359          return (dsl_scan(spa->spa_dsl_pool, func));
5043 5360  }
5044 5361  
5045 5362  /*
5046 5363   * ==========================================================================
5047 5364   * SPA async task processing
5048 5365   * ==========================================================================
5049 5366   */
5050 5367  
5051 5368  static void
5052 5369  spa_async_remove(spa_t *spa, vdev_t *vd)
5053 5370  {
5054 5371          if (vd->vdev_remove_wanted) {
5055 5372                  vd->vdev_remove_wanted = B_FALSE;
5056 5373                  vd->vdev_delayed_close = B_FALSE;
5057 5374                  vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
5058 5375  
5059 5376                  /*
5060 5377                   * We want to clear the stats, but we don't want to do a full
5061 5378                   * vdev_clear() as that will cause us to throw away
5062 5379                   * degraded/faulted state as well as attempt to reopen the
5063 5380                   * device, all of which is a waste.
5064 5381                   */
5065 5382                  vd->vdev_stat.vs_read_errors = 0;
5066 5383                  vd->vdev_stat.vs_write_errors = 0;
5067 5384                  vd->vdev_stat.vs_checksum_errors = 0;
5068 5385  
5069 5386                  vdev_state_dirty(vd->vdev_top);
5070 5387          }
5071 5388  
5072 5389          for (int c = 0; c < vd->vdev_children; c++)
5073 5390                  spa_async_remove(spa, vd->vdev_child[c]);
5074 5391  }
5075 5392  
5076 5393  static void
5077 5394  spa_async_probe(spa_t *spa, vdev_t *vd)
5078 5395  {
5079 5396          if (vd->vdev_probe_wanted) {
5080 5397                  vd->vdev_probe_wanted = B_FALSE;
5081 5398                  vdev_reopen(vd);        /* vdev_open() does the actual probe */
5082 5399          }
5083 5400  
5084 5401          for (int c = 0; c < vd->vdev_children; c++)
5085 5402                  spa_async_probe(spa, vd->vdev_child[c]);
5086 5403  }
5087 5404  
5088 5405  static void
5089 5406  spa_async_autoexpand(spa_t *spa, vdev_t *vd)
5090 5407  {
5091 5408          sysevent_id_t eid;
5092 5409          nvlist_t *attr;
5093 5410          char *physpath;
5094 5411  
5095 5412          if (!spa->spa_autoexpand)
5096 5413                  return;
5097 5414  
5098 5415          for (int c = 0; c < vd->vdev_children; c++) {
5099 5416                  vdev_t *cvd = vd->vdev_child[c];
5100 5417                  spa_async_autoexpand(spa, cvd);
5101 5418          }
5102 5419  
5103 5420          if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL)
5104 5421                  return;
5105 5422  
5106 5423          physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
5107 5424          (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath);
5108 5425  
5109 5426          VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5110 5427          VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0);
5111 5428  
5112 5429          (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
5113 5430              ESC_DEV_DLE, attr, &eid, DDI_SLEEP);
5114 5431  
5115 5432          nvlist_free(attr);
5116 5433          kmem_free(physpath, MAXPATHLEN);
5117 5434  }
5118 5435  
5119 5436  static void
5120 5437  spa_async_thread(spa_t *spa)
5121 5438  {
5122 5439          int tasks;
5123 5440  
5124 5441          ASSERT(spa->spa_sync_on);
5125 5442  
5126 5443          mutex_enter(&spa->spa_async_lock);
5127 5444          tasks = spa->spa_async_tasks;
5128 5445          spa->spa_async_tasks = 0;
5129 5446          mutex_exit(&spa->spa_async_lock);
5130 5447  
5131 5448          /*
5132 5449           * See if the config needs to be updated.
5133 5450           */
5134 5451          if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
5135 5452                  uint64_t old_space, new_space;
5136 5453  
5137 5454                  mutex_enter(&spa_namespace_lock);
5138 5455                  old_space = metaslab_class_get_space(spa_normal_class(spa));
5139 5456                  spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
5140 5457                  new_space = metaslab_class_get_space(spa_normal_class(spa));
5141 5458                  mutex_exit(&spa_namespace_lock);
5142 5459  
5143 5460                  /*
5144 5461                   * If the pool grew as a result of the config update,
5145 5462                   * then log an internal history event.
5146 5463                   */
5147 5464                  if (new_space != old_space) {
5148 5465                          spa_history_log_internal(LOG_POOL_VDEV_ONLINE,
5149 5466                              spa, NULL,
5150 5467                              "pool '%s' size: %llu(+%llu)",
5151 5468                              spa_name(spa), new_space, new_space - old_space);
5152 5469                  }
5153 5470          }
5154 5471  
5155 5472          /*
5156 5473           * See if any devices need to be marked REMOVED.
5157 5474           */
5158 5475          if (tasks & SPA_ASYNC_REMOVE) {
5159 5476                  spa_vdev_state_enter(spa, SCL_NONE);
5160 5477                  spa_async_remove(spa, spa->spa_root_vdev);
5161 5478                  for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
5162 5479                          spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
5163 5480                  for (int i = 0; i < spa->spa_spares.sav_count; i++)
5164 5481                          spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
5165 5482                  (void) spa_vdev_state_exit(spa, NULL, 0);
5166 5483          }
5167 5484  
5168 5485          if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
5169 5486                  spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5170 5487                  spa_async_autoexpand(spa, spa->spa_root_vdev);
5171 5488                  spa_config_exit(spa, SCL_CONFIG, FTAG);
5172 5489          }
5173 5490  
5174 5491          /*
5175 5492           * See if any devices need to be probed.
5176 5493           */
5177 5494          if (tasks & SPA_ASYNC_PROBE) {
5178 5495                  spa_vdev_state_enter(spa, SCL_NONE);
5179 5496                  spa_async_probe(spa, spa->spa_root_vdev);
5180 5497                  (void) spa_vdev_state_exit(spa, NULL, 0);
5181 5498          }
5182 5499  
5183 5500          /*
5184 5501           * If any devices are done replacing, detach them.
5185 5502           */
5186 5503          if (tasks & SPA_ASYNC_RESILVER_DONE)
5187 5504                  spa_vdev_resilver_done(spa);
5188 5505  
5189 5506          /*
5190 5507           * Kick off a resilver.
5191 5508           */
5192 5509          if (tasks & SPA_ASYNC_RESILVER)
5193 5510                  dsl_resilver_restart(spa->spa_dsl_pool, 0);
5194 5511  
5195 5512          /*
5196 5513           * Let the world know that we're done.
5197 5514           */
5198 5515          mutex_enter(&spa->spa_async_lock);
5199 5516          spa->spa_async_thread = NULL;
5200 5517          cv_broadcast(&spa->spa_async_cv);
5201 5518          mutex_exit(&spa->spa_async_lock);
5202 5519          thread_exit();
5203 5520  }
5204 5521  
5205 5522  void
5206 5523  spa_async_suspend(spa_t *spa)
5207 5524  {
5208 5525          mutex_enter(&spa->spa_async_lock);
5209 5526          spa->spa_async_suspended++;
5210 5527          while (spa->spa_async_thread != NULL)
5211 5528                  cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
5212 5529          mutex_exit(&spa->spa_async_lock);
5213 5530  }
5214 5531  
5215 5532  void
5216 5533  spa_async_resume(spa_t *spa)
5217 5534  {
5218 5535          mutex_enter(&spa->spa_async_lock);
5219 5536          ASSERT(spa->spa_async_suspended != 0);
5220 5537          spa->spa_async_suspended--;
5221 5538          mutex_exit(&spa->spa_async_lock);
5222 5539  }
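
/*
 * A pthread sketch of the suspend handshake above: suspenders wait on a
 * condition variable until the worker clears its "running" flag and
 * broadcasts, mirroring spa_async_suspend() waiting for
 * spa_async_thread() to exit.  Illustrative only; the suspend count is
 * omitted.
 */
#include <pthread.h>

static pthread_mutex_t async_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t async_cv = PTHREAD_COND_INITIALIZER;
static int worker_running;

static void
worker_exit(void)
{
        pthread_mutex_lock(&async_lock);
        worker_running = 0;
        pthread_cond_broadcast(&async_cv);      /* wake any suspender */
        pthread_mutex_unlock(&async_lock);
}

static void
suspend(void)
{
        pthread_mutex_lock(&async_lock);
        while (worker_running)                  /* wait for the worker */
                pthread_cond_wait(&async_cv, &async_lock);
        pthread_mutex_unlock(&async_lock);
}

int
main(void)
{
        worker_exit();
        suspend();                              /* returns immediately */
        return (0);
}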
5223 5540  
5224 5541  static void
5225 5542  spa_async_dispatch(spa_t *spa)
5226 5543  {
5227 5544          mutex_enter(&spa->spa_async_lock);
5228 5545          if (spa->spa_async_tasks && !spa->spa_async_suspended &&
5229 5546              spa->spa_async_thread == NULL &&
5230 5547              rootdir != NULL && !vn_is_readonly(rootdir))
5231 5548                  spa->spa_async_thread = thread_create(NULL, 0,
5232 5549                      spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri);
5233 5550          mutex_exit(&spa->spa_async_lock);
5234 5551  }
5235 5552  
5236 5553  void
5237 5554  spa_async_request(spa_t *spa, int task)
5238 5555  {
5239 5556          zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
5240 5557          mutex_enter(&spa->spa_async_lock);
5241 5558          spa->spa_async_tasks |= task;
5242 5559          mutex_exit(&spa->spa_async_lock);
5243 5560  }
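
/*
 * A condensed model of the request/dispatch protocol above: callers OR
 * task bits into a shared word under the lock; the worker snapshots and
 * clears the word in one critical section, then services each bit.  The
 * task values below are made up for the sketch.
 */
#include <assert.h>
#include <pthread.h>

#define TASK_REMOVE     0x1             /* hypothetical task bits */
#define TASK_PROBE      0x2

static pthread_mutex_t task_lock = PTHREAD_MUTEX_INITIALIZER;
static int pending_tasks;

static void
request(int task)
{
        pthread_mutex_lock(&task_lock);
        pending_tasks |= task;          /* merge with pending requests */
        pthread_mutex_unlock(&task_lock);
}

static int
worker_pass(void)
{
        pthread_mutex_lock(&task_lock);
        int tasks = pending_tasks;      /* snapshot ... */
        pending_tasks = 0;              /* ... and clear atomically */
        pthread_mutex_unlock(&task_lock);
        return (tasks);
}

int
main(void)
{
        request(TASK_REMOVE);
        request(TASK_PROBE);
        assert(worker_pass() == (TASK_REMOVE | TASK_PROBE));
        assert(worker_pass() == 0);     /* requests were consumed */
        return (0);
}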
5244 5561  
5245 5562  /*
5246 5563   * ==========================================================================
5247 5564   * SPA syncing routines
5248 5565   * ==========================================================================
5249 5566   */
5250 5567  
5251 5568  static int
5252 5569  bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5253 5570  {
5254 5571          bpobj_t *bpo = arg;
5255 5572          bpobj_enqueue(bpo, bp, tx);
5256 5573          return (0);
5257 5574  }
5258 5575  
5259 5576  static int
5260 5577  spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
5261 5578  {
5262 5579          zio_t *zio = arg;
5263 5580  
5264 5581          zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
5265 5582              zio->io_flags));
5266 5583          return (0);
5267 5584  }
5268 5585  
5269 5586  static void
5270 5587  spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
5271 5588  {
5272 5589          char *packed = NULL;
5273 5590          size_t bufsize;
5274 5591          size_t nvsize = 0;
5275 5592          dmu_buf_t *db;
5276 5593  
5277 5594          VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0);
5278 5595  
5279 5596          /*
5280 5597           * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration
5281 5598           * information.  This avoids the dbuf_will_dirty() path and
5282 5599           * saves us a pre-read to get data we don't actually care about.
5283 5600           */
5284      -        bufsize = P2ROUNDUP(nvsize, SPA_CONFIG_BLOCKSIZE);
     5601 +        bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
5285 5602          packed = kmem_alloc(bufsize, KM_SLEEP);
5286 5603  
5287 5604          VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
5288 5605              KM_SLEEP) == 0);
5289 5606          bzero(packed + nvsize, bufsize - nvsize);
5290 5607  
5291 5608          dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx);
5292 5609  
5293 5610          kmem_free(packed, bufsize);
5294 5611  
5295 5612          VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
5296 5613          dmu_buf_will_dirty(db, tx);
5297 5614          *(uint64_t *)db->db_data = nvsize;
5298 5615          dmu_buf_rele(db, FTAG);
5299 5616  }
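
/*
 * spa_sync_nvlist() pads the packed nvlist out to whole
 * SPA_CONFIG_BLOCKSIZE blocks so the dmu_write() covers full blocks; the
 * (uint64_t) cast added in this change keeps the P2ROUNDUP() arithmetic
 * 64-bit.  A standalone sketch of that macro, using its sysmacros.h
 * definition and 16K as a stand-in block size:
 */
#include <assert.h>
#include <stdint.h>

#define P2ROUNDUP(x, align)     (-(-(x) & -(align)))

int
main(void)
{
        uint64_t blocksize = 16384;

        assert(P2ROUNDUP((uint64_t)5000, blocksize) == 16384);
        assert(P2ROUNDUP((uint64_t)16384, blocksize) == 16384);
        assert(P2ROUNDUP((uint64_t)16385, blocksize) == 32768);
        return (0);
}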
5300 5617  
5301 5618  static void
5302 5619  spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
5303 5620      const char *config, const char *entry)
5304 5621  {
5305 5622          nvlist_t *nvroot;
5306 5623          nvlist_t **list;
5307 5624          int i;
5308 5625  
5309 5626          if (!sav->sav_sync)
5310 5627                  return;
5311 5628  
5312 5629          /*
5313 5630           * Update the MOS nvlist describing the list of available devices.
5314 5631           * spa_validate_aux() will have already made sure this nvlist is
5315 5632           * valid and the vdevs are labeled appropriately.
5316 5633           */
5317 5634          if (sav->sav_object == 0) {
5318 5635                  sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
5319 5636                      DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
5320 5637                      sizeof (uint64_t), tx);
5321 5638                  VERIFY(zap_update(spa->spa_meta_objset,
5322 5639                      DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
5323 5640                      &sav->sav_object, tx) == 0);
5324 5641          }
5325 5642  
5326 5643          VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
5327 5644          if (sav->sav_count == 0) {
5328 5645                  VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0);
5329 5646          } else {
5330 5647                  list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
5331 5648                  for (i = 0; i < sav->sav_count; i++)
5332 5649                          list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
5333 5650                              B_FALSE, VDEV_CONFIG_L2CACHE);
5334 5651                  VERIFY(nvlist_add_nvlist_array(nvroot, config, list,
5335 5652                      sav->sav_count) == 0);
5336 5653                  for (i = 0; i < sav->sav_count; i++)
5337 5654                          nvlist_free(list[i]);
5338 5655                  kmem_free(list, sav->sav_count * sizeof (void *));
5339 5656          }
5340 5657  
5341 5658          spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
5342 5659          nvlist_free(nvroot);
5343 5660  
5344 5661          sav->sav_sync = B_FALSE;
5345 5662  }
5346 5663  
5347 5664  static void
5348 5665  spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
5349 5666  {
5350 5667          nvlist_t *config;
5351 5668  
5352 5669          if (list_is_empty(&spa->spa_config_dirty_list))
5353 5670                  return;
5354 5671  
5355 5672          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5356 5673  
5357 5674          config = spa_config_generate(spa, spa->spa_root_vdev,
5358 5675              dmu_tx_get_txg(tx), B_FALSE);
5359 5676  
5360 5677          spa_config_exit(spa, SCL_STATE, FTAG);
5361 5678  
5362 5679          if (spa->spa_config_syncing)
5363 5680                  nvlist_free(spa->spa_config_syncing);
5364 5681          spa->spa_config_syncing = config;
5365 5682  
5366 5683          spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
5367 5684  }
5368 5685  
     5686 +static void
     5687 +spa_sync_version(void *arg1, void *arg2, dmu_tx_t *tx)
     5688 +{
     5689 +        spa_t *spa = arg1;
     5690 +        uint64_t version = *(uint64_t *)arg2;
     5691 +
     5692 +        /*
     5693 +         * Setting the version is special cased when first creating the pool.
     5694 +         */
     5695 +        ASSERT(tx->tx_txg != TXG_INITIAL);
     5696 +
     5697 +        ASSERT(version <= SPA_VERSION);
     5698 +        ASSERT(version >= spa_version(spa));
     5699 +
     5700 +        spa->spa_uberblock.ub_version = version;
     5701 +        vdev_config_dirty(spa->spa_root_vdev);
     5702 +}
     5703 +
5369 5704  /*
5370 5705   * Set zpool properties.
5371 5706   */
5372 5707  static void
5373 5708  spa_sync_props(void *arg1, void *arg2, dmu_tx_t *tx)
5374 5709  {
5375 5710          spa_t *spa = arg1;
5376 5711          objset_t *mos = spa->spa_meta_objset;
5377 5712          nvlist_t *nvp = arg2;
5378      -        nvpair_t *elem;
5379      -        uint64_t intval;
5380      -        char *strval;
5381      -        zpool_prop_t prop;
5382      -        const char *propname;
5383      -        zprop_type_t proptype;
     5713 +        nvpair_t *elem = NULL;
5384 5714  
5385 5715          mutex_enter(&spa->spa_props_lock);
5386 5716  
5387      -        elem = NULL;
5388 5717          while ((elem = nvlist_next_nvpair(nvp, elem))) {
     5718 +                uint64_t intval;
     5719 +                char *strval, *fname;
     5720 +                zpool_prop_t prop;
     5721 +                const char *propname;
     5722 +                zprop_type_t proptype;
     5723 +                zfeature_info_t *feature;
     5724 +
5389 5725                  switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
     5726 +                case ZPROP_INVAL:
     5727 +                        /*
     5728 +                         * We checked this earlier in spa_prop_validate().
     5729 +                         */
     5730 +                        ASSERT(zpool_prop_feature(nvpair_name(elem)));
     5731 +
     5732 +                        fname = strchr(nvpair_name(elem), '@') + 1;
     5733 +                        VERIFY3U(0, ==, zfeature_lookup_name(fname, &feature));
     5734 +
     5735 +                        spa_feature_enable(spa, feature, tx);
     5736 +                        break;
     5737 +
5390 5738                  case ZPOOL_PROP_VERSION:
     5739 +                        VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5391 5740                          /*
5392      -                         * Only set version for non-zpool-creation cases
5393      -                         * (set/import). spa_create() needs special care
5394      -                         * for version setting.
     5741 +                         * The version is synced separately before other
     5742 +                         * properties and should be correct by now.
5395 5743                           */
5396      -                        if (tx->tx_txg != TXG_INITIAL) {
5397      -                                VERIFY(nvpair_value_uint64(elem,
5398      -                                    &intval) == 0);
5399      -                                ASSERT(intval <= SPA_VERSION);
5400      -                                ASSERT(intval >= spa_version(spa));
5401      -                                spa->spa_uberblock.ub_version = intval;
5402      -                                vdev_config_dirty(spa->spa_root_vdev);
5403      -                        }
     5744 +                        ASSERT3U(spa_version(spa), >=, intval);
5404 5745                          break;
5405 5746  
5406 5747                  case ZPOOL_PROP_ALTROOT:
5407 5748                          /*
5408 5749                           * 'altroot' is a non-persistent property. It should
5409 5750                           * have been set temporarily at creation or import time.
5410 5751                           */
5411 5752                          ASSERT(spa->spa_root != NULL);
5412 5753                          break;
5413 5754  
5414 5755                  case ZPOOL_PROP_READONLY:
5415 5756                  case ZPOOL_PROP_CACHEFILE:
5416 5757                          /*
5417 5758                           * 'readonly' and 'cachefile' are also non-persistent
5418 5759                           * properties.
5419 5760                           */
5420 5761                          break;
5421 5762                  case ZPOOL_PROP_COMMENT:
5422 5763                          VERIFY(nvpair_value_string(elem, &strval) == 0);
5423 5764                          if (spa->spa_comment != NULL)
5424 5765                                  spa_strfree(spa->spa_comment);
5425 5766                          spa->spa_comment = spa_strdup(strval);
5426 5767                          /*
5427 5768                           * We need to dirty the configuration on all the vdevs
5428 5769                           * so that their labels get updated.  It's unnecessary
5429 5770                           * to do this for pool creation since the vdev's
5430 5771                           * configuration has already been dirtied.
5431 5772                           */
5432 5773                          if (tx->tx_txg != TXG_INITIAL)
5433 5774                                  vdev_config_dirty(spa->spa_root_vdev);
5434 5775                          break;
5435 5776                  default:
5436 5777                          /*
5437 5778                           * Set pool property values in the poolprops mos object.
5438 5779                           */
5439 5780                          if (spa->spa_pool_props_object == 0) {
5440      -                                VERIFY((spa->spa_pool_props_object =
5441      -                                    zap_create(mos, DMU_OT_POOL_PROPS,
5442      -                                    DMU_OT_NONE, 0, tx)) > 0);
5443      -
5444      -                                VERIFY(zap_update(mos,
     5781 +                                spa->spa_pool_props_object =
     5782 +                                    zap_create_link(mos, DMU_OT_POOL_PROPS,
5445 5783                                      DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
5446      -                                    8, 1, &spa->spa_pool_props_object, tx)
5447      -                                    == 0);
     5784 +                                    tx);
5448 5785                          }
5449 5786  
5450 5787                          /* normalize the property name */
5451 5788                          propname = zpool_prop_to_name(prop);
5452 5789                          proptype = zpool_prop_get_type(prop);
5453 5790  
5454 5791                          if (nvpair_type(elem) == DATA_TYPE_STRING) {
5455 5792                                  ASSERT(proptype == PROP_TYPE_STRING);
5456 5793                                  VERIFY(nvpair_value_string(elem, &strval) == 0);
5457 5794                                  VERIFY(zap_update(mos,
5458 5795                                      spa->spa_pool_props_object, propname,
5459 5796                                      1, strlen(strval) + 1, strval, tx) == 0);
5460 5797  
5461 5798                          } else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
5462 5799                                  VERIFY(nvpair_value_uint64(elem, &intval) == 0);
5463 5800  
5464 5801                                  if (proptype == PROP_TYPE_INDEX) {
5465 5802                                          const char *unused;
5466 5803                                          VERIFY(zpool_prop_index_to_string(
5467 5804                                              prop, intval, &unused) == 0);
5468 5805                                  }
5469 5806                                  VERIFY(zap_update(mos,
5470 5807                                      spa->spa_pool_props_object, propname,
5471 5808                                      8, 1, &intval, tx) == 0);
5472 5809                          } else {
5473 5810                                  ASSERT(0); /* not allowed */
5474 5811                          }
5475 5812  
5476 5813                          switch (prop) {
5477 5814                          case ZPOOL_PROP_DELEGATION:
5478 5815                                  spa->spa_delegation = intval;
5479 5816                                  break;
5480 5817                          case ZPOOL_PROP_BOOTFS:
5481 5818                                  spa->spa_bootfs = intval;
5482 5819                                  break;
5483 5820                          case ZPOOL_PROP_FAILUREMODE:
5484 5821                                  spa->spa_failmode = intval;
5485 5822                                  break;
5486 5823                          case ZPOOL_PROP_AUTOEXPAND:
5487 5824                                  spa->spa_autoexpand = intval;
5488 5825                                  if (tx->tx_txg != TXG_INITIAL)
5489 5826                                          spa_async_request(spa,
5490 5827                                              SPA_ASYNC_AUTOEXPAND);
5491 5828                                  break;
5492 5829                          case ZPOOL_PROP_DEDUPDITTO:
5493 5830                                  spa->spa_dedup_ditto = intval;
5494 5831                                  break;
5495 5832                          default:
5496 5833                                  break;
5497 5834                          }
5498 5835                  }
5499 5836  
5500 5837                  /* log internal history if this is not a zpool create */
5501 5838                  if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY &&
5502 5839                      tx->tx_txg != TXG_INITIAL) {
5503 5840                          spa_history_log_internal(LOG_POOL_PROPSET,
5504 5841                              spa, tx, "%s %lld %s",
5505 5842                              nvpair_name(elem), intval, spa_name(spa));
5506 5843                  }
5507 5844          }
5508 5845  
5509 5846          mutex_exit(&spa->spa_props_lock);
5510 5847  }
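
/*
 * The ZPROP_INVAL arm above strips everything through the '@' before the
 * zfeature lookup, turning "feature@<name>" into "<name>".  A standalone
 * sketch of that parsing ("async_destroy" is one of the features
 * introduced alongside this change):
 */
#include <assert.h>
#include <string.h>

static const char *
feature_name(const char *propname)
{
        const char *at = strchr(propname, '@');

        assert(at != NULL);     /* validated earlier, as in the code */
        return (at + 1);
}

int
main(void)
{
        assert(strcmp(feature_name("feature@async_destroy"),
            "async_destroy") == 0);
        return (0);
}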
5511 5848  
5512 5849  /*
5513 5850   * Perform one-time upgrade on-disk changes.  spa_version() does not
5514 5851   * reflect the new version this txg, so there must be no changes this
5515 5852   * txg to anything that the upgrade code depends on after it executes.
5516 5853   * Therefore this must be called after dsl_pool_sync() does the sync
5517 5854   * tasks.
5518 5855   */
5519 5856  static void
5520 5857  spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
5521 5858  {
5522 5859          dsl_pool_t *dp = spa->spa_dsl_pool;
5523 5860  
5524 5861          ASSERT(spa->spa_sync_pass == 1);
5525 5862  
5526 5863          if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
5527 5864              spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
5528 5865                  dsl_pool_create_origin(dp, tx);
5529 5866  
5530 5867                  /* Keeping the origin open increases spa_minref */
5531 5868                  spa->spa_minref += 3;
5532 5869          }
5533 5870  
5534 5871          if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
5535 5872              spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
5536 5873                  dsl_pool_upgrade_clones(dp, tx);
5537 5874          }
5538 5875  
5539 5876          if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
5540 5877              spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
5541 5878                  dsl_pool_upgrade_dir_clones(dp, tx);
5542 5879  
5543 5880                  /* Keeping the freedir open increases spa_minref */
5544 5881                  spa->spa_minref += 3;
5545 5882          }
     5883 +
     5884 +        if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
     5885 +            spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
     5886 +                spa_feature_create_zap_objects(spa, tx);
     5887 +        }
5546 5888  }
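
/*
 * Each block in spa_sync_upgrades() is an edge trigger: ub_version has
 * already been raised this txg while spa_ubsync still holds the previous
 * txg's version, so the upgrade work runs exactly once, in the txg that
 * crosses the threshold.  A minimal model with a made-up version number:
 */
#include <assert.h>
#include <stdint.h>

#define UPGRADE_VERSION 21              /* hypothetical threshold */

static int upgrades_run;

static void
sync_upgrades(uint64_t prev, uint64_t cur)
{
        if (prev < UPGRADE_VERSION && cur >= UPGRADE_VERSION)
                upgrades_run++;         /* one-time upgrade work */
}

int
main(void)
{
        sync_upgrades(20, 21);          /* crossing txg: fires */
        sync_upgrades(21, 21);          /* later txgs: no-op */
        assert(upgrades_run == 1);
        return (0);
}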
5547 5889  
5548 5890  /*
5549 5891   * Sync the specified transaction group.  New blocks may be dirtied as
5550 5892   * part of the process, so we iterate until it converges.
5551 5893   */
5552 5894  void
5553 5895  spa_sync(spa_t *spa, uint64_t txg)
5554 5896  {
5555 5897          dsl_pool_t *dp = spa->spa_dsl_pool;
5556 5898          objset_t *mos = spa->spa_meta_objset;
5557 5899          bpobj_t *defer_bpo = &spa->spa_deferred_bpobj;
5558 5900          bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
5559 5901          vdev_t *rvd = spa->spa_root_vdev;
5560 5902          vdev_t *vd;
5561 5903          dmu_tx_t *tx;
5562 5904          int error;
5563 5905  
5564 5906          VERIFY(spa_writeable(spa));
5565 5907  
5566 5908          /*
5567 5909           * Lock out configuration changes.
5568 5910           */
5569 5911          spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
5570 5912  
5571 5913          spa->spa_syncing_txg = txg;
5572 5914          spa->spa_sync_pass = 0;
5573 5915  
5574 5916          /*
5575 5917           * If there are any pending vdev state changes, convert them
5576 5918           * into config changes that go out with this transaction group.
5577 5919           */
5578 5920          spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5579 5921          while (list_head(&spa->spa_state_dirty_list) != NULL) {
5580 5922                  /*
5581 5923                   * We need the write lock here because, for aux vdevs,
5582 5924                   * calling vdev_config_dirty() modifies sav_config.
5583 5925                   * This is ugly and will become unnecessary when we
5584 5926                   * eliminate the aux vdev wart by integrating all vdevs
5585 5927                   * into the root vdev tree.
5586 5928                   */
5587 5929                  spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
5588 5930                  spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
5589 5931                  while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
5590 5932                          vdev_state_clean(vd);
5591 5933                          vdev_config_dirty(vd);
5592 5934                  }
5593 5935                  spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
5594 5936                  spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
5595 5937          }
5596 5938          spa_config_exit(spa, SCL_STATE, FTAG);
5597 5939  
5598 5940          tx = dmu_tx_create_assigned(dp, txg);
5599 5941  
5600 5942          /*
5601 5943           * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
5602 5944           * set spa_deflate if we have no raid-z vdevs.
5603 5945           */
5604 5946          if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
5605 5947              spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
5606 5948                  int i;
5607 5949  
5608 5950                  for (i = 0; i < rvd->vdev_children; i++) {
5609 5951                          vd = rvd->vdev_child[i];
5610 5952                          if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
5611 5953                                  break;
5612 5954                  }
5613 5955                  if (i == rvd->vdev_children) {
5614 5956                          spa->spa_deflate = TRUE;
5615 5957                          VERIFY(0 == zap_add(spa->spa_meta_objset,
5616 5958                              DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
5617 5959                              sizeof (uint64_t), 1, &spa->spa_deflate, tx));
5618 5960                  }
5619 5961          }
5620 5962  
5621 5963          /*
5622 5964           * If anything has changed in this txg, or if someone is waiting
5623 5965           * for this txg to sync (e.g., spa_vdev_remove()), push the
5624 5966           * deferred frees from the previous txg.  If not, leave them
5625 5967           * alone so that we don't generate work on an otherwise idle
5626 5968           * system.
5627 5969           */
5628 5970          if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
5629 5971              !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
5630 5972              !txg_list_empty(&dp->dp_sync_tasks, txg) ||
5631 5973              ((dsl_scan_active(dp->dp_scan) ||
5632 5974              txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
5633 5975                  zio_t *zio = zio_root(spa, NULL, NULL, 0);
5634 5976                  VERIFY3U(bpobj_iterate(defer_bpo,
5635 5977                      spa_free_sync_cb, zio, tx), ==, 0);
5636 5978                  VERIFY3U(zio_wait(zio), ==, 0);
5637 5979          }
5638 5980  
5639 5981          /*
5640 5982           * Iterate to convergence.
5641 5983           */
5642 5984          do {
5643 5985                  int pass = ++spa->spa_sync_pass;
5644 5986  
5645 5987                  spa_sync_config_object(spa, tx);
5646 5988                  spa_sync_aux_dev(spa, &spa->spa_spares, tx,
5647 5989                      ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
5648 5990                  spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
5649 5991                      ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
5650 5992                  spa_errlog_sync(spa, txg);
5651 5993                  dsl_pool_sync(dp, txg);
5652 5994  
5653 5995                  if (pass <= SYNC_PASS_DEFERRED_FREE) {
5654 5996                          zio_t *zio = zio_root(spa, NULL, NULL, 0);
5655 5997                          bplist_iterate(free_bpl, spa_free_sync_cb,
5656 5998                              zio, tx);
5657 5999                          VERIFY(zio_wait(zio) == 0);
5658 6000                  } else {
5659 6001                          bplist_iterate(free_bpl, bpobj_enqueue_cb,
5660 6002                              defer_bpo, tx);
5661 6003                  }
5662 6004  
5663 6005                  ddt_sync(spa, txg);
5664 6006                  dsl_scan_sync(dp, tx);
5665 6007  
5666 6008                  while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
5667 6009                          vdev_sync(vd, txg);
5668 6010  
5669 6011                  if (pass == 1)
5670 6012                          spa_sync_upgrades(spa, tx);
5671 6013  
5672 6014          } while (dmu_objset_is_dirty(mos, txg));
5673 6015  
5674 6016          /*
5675 6017           * Rewrite the vdev configuration (which includes the uberblock)
5676 6018           * to commit the transaction group.
5677 6019           *
5678 6020           * If there are no dirty vdevs, we sync the uberblock to a few
5679 6021           * random top-level vdevs that are known to be visible in the
5680 6022           * config cache (see spa_vdev_add() for a complete description).
5681 6023           * If there *are* dirty vdevs, sync the uberblock to all vdevs.
5682 6024           */
5683 6025          for (;;) {
5684 6026                  /*
5685 6027                   * We hold SCL_STATE to prevent vdev open/close/etc.
5686 6028                   * while we're attempting to write the vdev labels.
5687 6029                   */
5688 6030                  spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
5689 6031  
5690 6032                  if (list_is_empty(&spa->spa_config_dirty_list)) {
5691 6033                          vdev_t *svd[SPA_DVAS_PER_BP];
5692 6034                          int svdcount = 0;
5693 6035                          int children = rvd->vdev_children;
5694 6036                          int c0 = spa_get_random(children);
5695 6037  
5696 6038                          for (int c = 0; c < children; c++) {
5697 6039                                  vd = rvd->vdev_child[(c0 + c) % children];
5698 6040                                  if (vd->vdev_ms_array == 0 || vd->vdev_islog)
5699 6041                                          continue;
5700 6042                                  svd[svdcount++] = vd;
5701 6043                                  if (svdcount == SPA_DVAS_PER_BP)
5702 6044                                          break;
5703 6045                          }
5704 6046                          error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
5705 6047                          if (error != 0)
5706 6048                                  error = vdev_config_sync(svd, svdcount, txg,
5707 6049                                      B_TRUE);
5708 6050                  } else {
5709 6051                          error = vdev_config_sync(rvd->vdev_child,
5710 6052                              rvd->vdev_children, txg, B_FALSE);
5711 6053                          if (error != 0)
5712 6054                                  error = vdev_config_sync(rvd->vdev_child,
5713 6055                                      rvd->vdev_children, txg, B_TRUE);
5714 6056                  }
5715 6057  
5716 6058                  spa_config_exit(spa, SCL_STATE, FTAG);
5717 6059  
5718 6060                  if (error == 0)
5719 6061                          break;
5720 6062                  zio_suspend(spa, NULL);
5721 6063                  zio_resume_wait(spa);
5722 6064          }
5723 6065          dmu_tx_commit(tx);
5724 6066  
5725 6067          /*
5726 6068           * Clear the dirty config list.
5727 6069           */
5728 6070          while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
5729 6071                  vdev_config_clean(vd);
5730 6072  
5731 6073          /*
5732 6074           * Now that the new config has synced transactionally,
5733 6075           * let it become visible to the config cache.
5734 6076           */
5735 6077          if (spa->spa_config_syncing != NULL) {
5736 6078                  spa_config_set(spa, spa->spa_config_syncing);
5737 6079                  spa->spa_config_txg = txg;
5738 6080                  spa->spa_config_syncing = NULL;
5739 6081          }
5740 6082  
5741 6083          spa->spa_ubsync = spa->spa_uberblock;
5742 6084  
5743 6085          dsl_pool_sync_done(dp, txg);
5744 6086  
5745 6087          /*
5746 6088           * Update usable space statistics.
5747 6089           */
5748 6090          while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
5749 6091                  vdev_sync_done(vd, txg);
5750 6092  
5751 6093          spa_update_dspace(spa);
5752 6094  
5753 6095          /*
5754 6096           * It had better be the case that we didn't dirty anything
5755 6097           * since vdev_config_sync().
5756 6098           */
5757 6099          ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
5758 6100          ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
5759 6101          ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
5760 6102  
5761 6103          spa->spa_sync_pass = 0;
5762 6104  
5763 6105          spa_config_exit(spa, SCL_CONFIG, FTAG);
5764 6106  
5765 6107          spa_handle_ignored_writes(spa);
5766 6108  
5767 6109          /*
5768 6110           * If any async tasks have been requested, kick them off.
5769 6111           */
5770 6112          spa_async_dispatch(spa);
5771 6113  }
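
The do/while in spa_sync() above is a fixpoint iteration: dsl_pool_sync(), ddt_sync(), and friends can dirty new MOS blocks while syncing, so the loop repeats until a pass completes with the MOS clean, and the pass counter gates work that must not repeat (direct frees only through SYNC_PASS_DEFERRED_FREE, upgrades only in pass 1). The shape of the idiom reduced to a sketch, with sync_one_pass and mos_is_dirty as hypothetical stand-ins and illumos <sys/types.h> assumed for boolean_t:

    #include <sys/types.h>

    extern void sync_one_pass(int pass);    /* may dirty more metadata */
    extern boolean_t mos_is_dirty(void);

    static void
    sync_to_convergence(void)
    {
            int pass = 0;

            do {
                    sync_one_pass(++pass);
            } while (mos_is_dirty());       /* stop once a pass dirties nothing */
    }
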
5772 6114  
5773 6115  /*
5774 6116   * Sync all pools.  We don't want to hold the namespace lock across these
5775 6117   * operations, so we take a reference on the spa_t and drop the lock during the
5776 6118   * sync.
5777 6119   */
5778 6120  void
5779 6121  spa_sync_allpools(void)
5780 6122  {
5781 6123          spa_t *spa = NULL;
5782 6124          mutex_enter(&spa_namespace_lock);
5783 6125          while ((spa = spa_next(spa)) != NULL) {
5784 6126                  if (spa_state(spa) != POOL_STATE_ACTIVE ||
5785 6127                      !spa_writeable(spa) || spa_suspended(spa))
5786 6128                          continue;
5787 6129                  spa_open_ref(spa, FTAG);
5788 6130                  mutex_exit(&spa_namespace_lock);
5789 6131                  txg_wait_synced(spa_get_dsl(spa), 0);
5790 6132                  mutex_enter(&spa_namespace_lock);
5791 6133                  spa_close(spa, FTAG);
5792 6134          }
5793 6135          mutex_exit(&spa_namespace_lock);
5794 6136  }
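
spa_sync_allpools() illustrates the standard hold-and-drop pattern for doing blocking work on members of a locked list: take a hold on the element so it stays valid, drop the list lock for the blocking call, then retake the lock and release the hold before advancing. A generic sketch of the pattern (every name here is hypothetical):

    extern void list_lock(void), list_unlock(void);
    extern void *next_elem(void *);         /* NULL starts the walk */
    extern void hold(void *), rele(void *);
    extern void blocking_work(void *);      /* must not be called locked */

    static void
    walk_all(void)
    {
            void *e = NULL;

            list_lock();
            while ((e = next_elem(e)) != NULL) {
                    hold(e);                /* keep e valid across unlock */
                    list_unlock();
                    blocking_work(e);
                    list_lock();
                    rele(e);
            }
            list_unlock();
    }
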
5795 6137  
5796 6138  /*
5797 6139   * ==========================================================================
5798 6140   * Miscellaneous routines
5799 6141   * ==========================================================================
5800 6142   */
5801 6143  
5802 6144  /*
5803 6145   * Remove all pools in the system.
5804 6146   */
5805 6147  void
5806 6148  spa_evict_all(void)
5807 6149  {
5808 6150          spa_t *spa;
5809 6151  
5810 6152          /*
5811 6153           * Remove all cached state.  All pools should be closed now,
5812 6154           * so every spa in the AVL tree should be unreferenced.
5813 6155           */
5814 6156          mutex_enter(&spa_namespace_lock);
5815 6157          while ((spa = spa_next(NULL)) != NULL) {
5816 6158                  /*
5817 6159                   * Stop async tasks.  The async thread may need to detach
5818 6160                   * a device that's been replaced, which requires grabbing
5819 6161                   * spa_namespace_lock, so we must drop it here.
5820 6162                   */
5821 6163                  spa_open_ref(spa, FTAG);
5822 6164                  mutex_exit(&spa_namespace_lock);
5823 6165                  spa_async_suspend(spa);
5824 6166                  mutex_enter(&spa_namespace_lock);
5825 6167                  spa_close(spa, FTAG);
5826 6168  
5827 6169                  if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
5828 6170                          spa_unload(spa);
5829 6171                          spa_deactivate(spa);
5830 6172                  }
5831 6173                  spa_remove(spa);
5832 6174          }
5833 6175          mutex_exit(&spa_namespace_lock);
5834 6176  }
5835 6177  
5836 6178  vdev_t *
5837 6179  spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
5838 6180  {
5839 6181          vdev_t *vd;
5840 6182          int i;
5841 6183  
5842 6184          if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL)
5843 6185                  return (vd);
5844 6186  
5845 6187          if (aux) {
5846 6188                  for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
5847 6189                          vd = spa->spa_l2cache.sav_vdevs[i];
5848 6190                          if (vd->vdev_guid == guid)
5849 6191                                  return (vd);
5850 6192                  }
5851 6193  
5852 6194                  for (i = 0; i < spa->spa_spares.sav_count; i++) {
5853 6195                          vd = spa->spa_spares.sav_vdevs[i];
5854 6196                          if (vd->vdev_guid == guid)
5855 6197                                  return (vd);
5856 6198                  }
5857 6199          }
5858 6200  
5859 6201          return (NULL);
5860 6202  }
5861 6203  
5862 6204  void
5863 6205  spa_upgrade(spa_t *spa, uint64_t version)
5864 6206  {
5865 6207          ASSERT(spa_writeable(spa));
5866 6208  
5867 6209          spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
5868 6210  
5869 6211          /*
5870 6212           * This should only be called on a non-faulted pool; a pool from a
5871 6213           * future version would be unopenable, so an uberblock version above
5872 6214           * SPA_VERSION should be impossible here.
5873 6215           */
5874 6216          ASSERT(spa->spa_uberblock.ub_version <= SPA_VERSION);
5875 6217          ASSERT(version >= spa->spa_uberblock.ub_version);
5876 6218  
5877 6219          spa->spa_uberblock.ub_version = version;
5878 6220          vdev_config_dirty(spa->spa_root_vdev);
5879 6221  
5880 6222          spa_config_exit(spa, SCL_ALL, FTAG);
5881 6223  
5882 6224          txg_wait_synced(spa_get_dsl(spa), 0);
5883 6225  }
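
Note that spa_upgrade() performs no on-disk conversion itself: it raises ub_version under SCL_ALL, dirties the root vdev config, and blocks in txg_wait_synced() until the next spa_sync(), whose pass-1 call to spa_sync_upgrades() does the version-crossing work, including the spa_feature_create_zap_objects() call added in this webrev. A caller-side sketch, assuming a writeable spa_t *spa already opened:

    /*
     * After this returns, any work triggered by crossing a version
     * boundary (e.g. the feature ZAP objects for SPA_VERSION_FEATURES)
     * has been written out by spa_sync().
     */
    spa_upgrade(spa, SPA_VERSION_FEATURES);
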
5884 6226  
5885 6227  boolean_t
5886 6228  spa_has_spare(spa_t *spa, uint64_t guid)
5887 6229  {
5888 6230          int i;
5889 6231          uint64_t spareguid;
5890 6232          spa_aux_vdev_t *sav = &spa->spa_spares;
5891 6233  
5892 6234          for (i = 0; i < sav->sav_count; i++)
5893 6235                  if (sav->sav_vdevs[i]->vdev_guid == guid)
5894 6236                          return (B_TRUE);
5895 6237  
5896 6238          for (i = 0; i < sav->sav_npending; i++) {
5897 6239                  if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID,
5898 6240                      &spareguid) == 0 && spareguid == guid)
5899 6241                          return (B_TRUE);
5900 6242          }
5901 6243  
5902 6244          return (B_FALSE);
5903 6245  }
5904 6246  
5905 6247  /*
5906 6248   * Check if a pool has an active shared spare device.
5907 6249   * Note: the reference count of an active spare is 2, as a spare and as a replacement
5908 6250   */
5909 6251  static boolean_t
5910 6252  spa_has_active_shared_spare(spa_t *spa)
5911 6253  {
5912 6254          int i, refcnt;
5913 6255          uint64_t pool;
5914 6256          spa_aux_vdev_t *sav = &spa->spa_spares;
5915 6257  
5916 6258          for (i = 0; i < sav->sav_count; i++) {
5917 6259                  if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool,
5918 6260                      &refcnt) && pool != 0ULL && pool == spa_guid(spa) &&
5919 6261                      refcnt > 2)
5920 6262                          return (B_TRUE);
5921 6263          }
5922 6264  
5923 6265          return (B_FALSE);
5924 6266  }
5925 6267  
5926 6268  /*
5927 6269   * Post a sysevent corresponding to the given event.  The 'name' must be one of
5928 6270   * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
5929 6271   * filled in from the spa and (optionally) the vdev.  This doesn't do anything
5930 6272   * in the userland libzpool, as we don't want consumers to misinterpret ztest
5931 6273   * or zdb as real changes.
5932 6274   */
5933 6275  void
5934 6276  spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
5935 6277  {
5936 6278  #ifdef _KERNEL
5937 6279          sysevent_t              *ev;
5938 6280          sysevent_attr_list_t    *attr = NULL;
5939 6281          sysevent_value_t        value;
5940 6282          sysevent_id_t           eid;
5941 6283  
5942 6284          ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
5943 6285              SE_SLEEP);
5944 6286  
5945 6287          value.value_type = SE_DATA_TYPE_STRING;
5946 6288          value.value.sv_string = spa_name(spa);
5947 6289          if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
5948 6290                  goto done;
5949 6291  
5950 6292          value.value_type = SE_DATA_TYPE_UINT64;
5951 6293          value.value.sv_uint64 = spa_guid(spa);
5952 6294          if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
5953 6295                  goto done;
5954 6296  
5955 6297          if (vd) {
5956 6298                  value.value_type = SE_DATA_TYPE_UINT64;
5957 6299                  value.value.sv_uint64 = vd->vdev_guid;
5958 6300                  if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
5959 6301                      SE_SLEEP) != 0)
5960 6302                          goto done;
5961 6303  
5962 6304                  if (vd->vdev_path) {
5963 6305                          value.value_type = SE_DATA_TYPE_STRING;
5964 6306                          value.value.sv_string = vd->vdev_path;
5965 6307                          if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
5966 6308                              &value, SE_SLEEP) != 0)
5967 6309                                  goto done;
5968 6310                  }
5969 6311          }
5970 6312  
5971 6313          if (sysevent_attach_attributes(ev, attr) != 0)
5972 6314                  goto done;
5973 6315          attr = NULL;
5974 6316  
5975 6317          (void) log_sysevent(ev, SE_SLEEP, &eid);
5976 6318  
5977 6319  done:
5978 6320          if (attr)
5979 6321                  sysevent_free_attr(attr);
5980 6322          sysevent_free(ev);
5981 6323  #endif
5982 6324  }
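
spa_event_notify() is a textbook goto-done cleanup: each failed sysevent_add_attr() jumps to a single exit path, and once sysevent_attach_attributes() succeeds, ownership of the attr list moves into the event, so attr is set to NULL to keep the exit path from freeing it twice. The same shape in miniature (every name here is hypothetical):

    extern void *alloc_event(void);
    extern int add_attr(void **listp);
    extern int attach(void *ev, void *list); /* takes ownership on success */
    extern void free_list(void *), free_event(void *);

    static void
    notify(void)
    {
            void *ev = alloc_event();
            void *attr = NULL;

            if (add_attr(&attr) != 0)
                    goto done;
            if (attach(ev, attr) != 0)
                    goto done;
            attr = NULL;            /* now owned by ev; don't free twice */
            /* ... post the event ... */
    done:
            if (attr != NULL)
                    free_list(attr);
            free_event(ev);
    }
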
  