1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
  25  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  26  * Copyright 2016 Igor Kozhukhov <ikozhukhov@gmail.com>.
  27  */
  28 
  29 /*
  30  * Functions to convert between a list of vdevs and an nvlist representing the
  31  * configuration.  Each entry in the list can be one of:
  32  *
  33  *      Device vdevs
  34  *              disk=(path=..., devid=...)
  35  *              file=(path=...)
  36  *
  37  *      Group vdevs
  38  *              raidz[1|2]=(...)
  39  *              mirror=(...)
  40  *
  41  *      Hot spares
  42  *
  43  * While the underlying implementation supports it, group vdevs cannot contain
  44  * other group vdevs.  All userland verification of devices is contained within
  45  * this file.  If successful, the nvlist returned can be passed directly to the
  46  * kernel; we've done as much verification as possible in userland.
  47  *
  48  * Hot spares are a special case, and passed down as an array of disk vdevs, at
  49  * the same level as the root of the vdev tree.
  50  *
  51  * The only function exported by this file is 'make_root_vdev'.  The
  52  * function performs several passes:
  53  *
  54  *      1. Construct the vdev specification.  Performs syntax validation and
  55  *         makes sure each device is valid.
  56  *      2. Check for devices in use.  Using libdiskmgt, makes sure that no
  57  *         devices are also in use.  Some can be overridden using the 'force'
  58  *         flag, others cannot.
 *      3. Check for replication errors if the 'force' flag is not specified.
 *         Validates that the replication level is consistent across the
 *         entire pool.
  62  *      4. Call libzfs to label any whole disks with an EFI label.
  63  */
  64 
  65 #include <assert.h>
  66 #include <devid.h>
  67 #include <errno.h>
  68 #include <fcntl.h>
  69 #include <libdiskmgt.h>
  70 #include <libintl.h>
  71 #include <libnvpair.h>
  72 #include <limits.h>
  73 #include <stdio.h>
  74 #include <string.h>
  75 #include <unistd.h>
  76 #include <sys/efi_partition.h>
  77 #include <sys/stat.h>
  78 #include <sys/vtoc.h>
  79 #include <sys/mntent.h>
  80 
  81 #include "zpool_util.h"
  82 
/*
 * Slice suffix that is_whole_disk() appends to the device name when it
 * builds the raw-device path it probes.
 */
#define BACKUP_SLICE    "s2"
  84 
/*
 * For any given vdev specification, we can have multiple errors.  The
 * vdev_error() function keeps track of whether we have seen an error yet, and
 * prints out a header if it's the first error we've seen.
 *
 *      error_seen      set to B_TRUE by vdev_error() once the header has
 *                      been printed.
 *      is_force        selects the second header line: when clear the user
 *                      is told '-f' can override the errors, when set that
 *                      they must be repaired manually.  It is only read in
 *                      the code visible here.
 */
boolean_t error_seen;
boolean_t is_force;
  92 
  93 /*PRINTFLIKE1*/
  94 static void
  95 vdev_error(const char *fmt, ...)
  96 {
  97         va_list ap;
  98 
  99         if (!error_seen) {
 100                 (void) fprintf(stderr, gettext("invalid vdev specification\n"));
 101                 if (!is_force)
 102                         (void) fprintf(stderr, gettext("use '-f' to override "
 103                             "the following errors:\n"));
 104                 else
 105                         (void) fprintf(stderr, gettext("the following errors "
 106                             "must be manually repaired:\n"));
 107                 error_seen = B_TRUE;
 108         }
 109 
 110         va_start(ap, fmt);
 111         (void) vfprintf(stderr, fmt, ap);
 112         va_end(ap);
 113 }
 114 
 115 static void
 116 libdiskmgt_error(int error)
 117 {
 118         /*
 119          * ENXIO/ENODEV is a valid error message if the device doesn't live in
 120          * /dev/dsk.  Don't bother printing an error message in this case.
 121          */
 122         if (error == ENXIO || error == ENODEV)
 123                 return;
 124 
 125         (void) fprintf(stderr, gettext("warning: device in use checking "
 126             "failed: %s\n"), strerror(error));
 127 }
 128 
 129 /*
 130  * Validate a device, passing the bulk of the work off to libdiskmgt.
 131  */
 132 static int
 133 check_slice(const char *path, int force, boolean_t wholedisk, boolean_t isspare)
 134 {
 135         char *msg;
 136         int error = 0;
 137         dm_who_type_t who;
 138 
 139         if (force)
 140                 who = DM_WHO_ZPOOL_FORCE;
 141         else if (isspare)
 142                 who = DM_WHO_ZPOOL_SPARE;
 143         else
 144                 who = DM_WHO_ZPOOL;
 145 
 146         if (dm_inuse((char *)path, &msg, who, &error) || error) {
 147                 if (error != 0) {
 148                         libdiskmgt_error(error);
 149                         return (0);
 150                 } else {
 151                         vdev_error("%s", msg);
 152                         free(msg);
 153                         return (-1);
 154                 }
 155         }
 156 
 157         /*
 158          * If we're given a whole disk, ignore overlapping slices since we're
 159          * about to label it anyway.
 160          */
 161         error = 0;
 162         if (!wholedisk && !force &&
 163             (dm_isoverlapping((char *)path, &msg, &error) || error)) {
 164                 if (error == 0) {
 165                         /* dm_isoverlapping returned -1 */
 166                         vdev_error(gettext("%s overlaps with %s\n"), path, msg);
 167                         free(msg);
 168                         return (-1);
 169                 } else if (error != ENODEV) {
 170                         /* libdiskmgt's devcache only handles physical drives */
 171                         libdiskmgt_error(error);
 172                         return (0);
 173                 }
 174         }
 175 
 176         return (0);
 177 }
 178 
 179 
 180 /*
 181  * Validate a whole disk.  Iterate over all slices on the disk and make sure
 182  * that none is in use by calling check_slice().
 183  */
 184 static int
 185 check_disk(const char *name, dm_descriptor_t disk, int force, int isspare)
 186 {
 187         dm_descriptor_t *drive, *media, *slice;
 188         int err = 0;
 189         int i;
 190         int ret;
 191 
 192         /*
 193          * Get the drive associated with this disk.  This should never fail,
 194          * because we already have an alias handle open for the device.
 195          */
 196         if ((drive = dm_get_associated_descriptors(disk, DM_DRIVE,
 197             &err)) == NULL || *drive == NULL) {
 198                 if (err)
 199                         libdiskmgt_error(err);
 200                 return (0);
 201         }
 202 
 203         if ((media = dm_get_associated_descriptors(*drive, DM_MEDIA,
 204             &err)) == NULL) {
 205                 dm_free_descriptors(drive);
 206                 if (err)
 207                         libdiskmgt_error(err);
 208                 return (0);
 209         }
 210 
 211         dm_free_descriptors(drive);
 212 
 213         /*
 214          * It is possible that the user has specified a removable media drive,
 215          * and the media is not present.
 216          */
 217         if (*media == NULL) {
 218                 dm_free_descriptors(media);
 219                 vdev_error(gettext("'%s' has no media in drive\n"), name);
 220                 return (-1);
 221         }
 222 
 223         if ((slice = dm_get_associated_descriptors(*media, DM_SLICE,
 224             &err)) == NULL) {
 225                 dm_free_descriptors(media);
 226                 if (err)
 227                         libdiskmgt_error(err);
 228                 return (0);
 229         }
 230 
 231         dm_free_descriptors(media);
 232 
 233         ret = 0;
 234 
 235         /*
 236          * Iterate over all slices and report any errors.  We don't care about
 237          * overlapping slices because we are using the whole disk.
 238          */
 239         for (i = 0; slice[i] != NULL; i++) {
 240                 char *name = dm_get_name(slice[i], &err);
 241 
 242                 if (check_slice(name, force, B_TRUE, isspare) != 0)
 243                         ret = -1;
 244 
 245                 dm_free_name(name);
 246         }
 247 
 248         dm_free_descriptors(slice);
 249         return (ret);
 250 }
 251 
 252 /*
 253  * Validate a device.
 254  */
 255 static int
 256 check_device(const char *path, boolean_t force, boolean_t isspare)
 257 {
 258         dm_descriptor_t desc;
 259         int err;
 260         char *dev;
 261 
 262         /*
 263          * For whole disks, libdiskmgt does not include the leading dev path.
 264          */
 265         dev = strrchr(path, '/');
 266         assert(dev != NULL);
 267         dev++;
 268         if ((desc = dm_get_descriptor_by_name(DM_ALIAS, dev, &err)) != NULL) {
 269                 err = check_disk(path, desc, force, isspare);
 270                 dm_free_descriptor(desc);
 271                 return (err);
 272         }
 273 
 274         return (check_slice(path, force, B_FALSE, isspare));
 275 }
 276 
 277 /*
 278  * Check that a file is valid.  All we can do in this case is check that it's
 279  * not in use by another pool, and not in use by swap.
 280  */
 281 static int
 282 check_file(const char *file, boolean_t force, boolean_t isspare)
 283 {
 284         char  *name;
 285         int fd;
 286         int ret = 0;
 287         int err;
 288         pool_state_t state;
 289         boolean_t inuse;
 290 
 291         if (dm_inuse_swap(file, &err)) {
 292                 if (err)
 293                         libdiskmgt_error(err);
 294                 else
 295                         vdev_error(gettext("%s is currently used by swap. "
 296                             "Please see swap(1M).\n"), file);
 297                 return (-1);
 298         }
 299 
 300         if ((fd = open(file, O_RDONLY)) < 0)
 301                 return (0);
 302 
 303         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
 304                 const char *desc;
 305 
 306                 switch (state) {
 307                 case POOL_STATE_ACTIVE:
 308                         desc = gettext("active");
 309                         break;
 310 
 311                 case POOL_STATE_EXPORTED:
 312                         desc = gettext("exported");
 313                         break;
 314 
 315                 case POOL_STATE_POTENTIALLY_ACTIVE:
 316                         desc = gettext("potentially active");
 317                         break;
 318 
 319                 default:
 320                         desc = gettext("unknown");
 321                         break;
 322                 }
 323 
 324                 /*
 325                  * Allow hot spares to be shared between pools.
 326                  */
 327                 if (state == POOL_STATE_SPARE && isspare)
 328                         return (0);
 329 
 330                 if (state == POOL_STATE_ACTIVE ||
 331                     state == POOL_STATE_SPARE || !force) {
 332                         switch (state) {
 333                         case POOL_STATE_SPARE:
 334                                 vdev_error(gettext("%s is reserved as a hot "
 335                                     "spare for pool %s\n"), file, name);
 336                                 break;
 337                         default:
 338                                 vdev_error(gettext("%s is part of %s pool "
 339                                     "'%s'\n"), file, desc, name);
 340                                 break;
 341                         }
 342                         ret = -1;
 343                 }
 344 
 345                 free(name);
 346         }
 347 
 348         (void) close(fd);
 349         return (ret);
 350 }
 351 
 352 
 353 /*
 354  * By "whole disk" we mean an entire physical disk (something we can
 355  * label, toggle the write cache on, etc.) as opposed to the full
 356  * capacity of a pseudo-device such as lofi or did.  We act as if we
 357  * are labeling the disk, which should be a pretty good test of whether
 358  * it's a viable device or not.  Returns B_TRUE if it is and B_FALSE if
 359  * it isn't.
 360  */
 361 static boolean_t
 362 is_whole_disk(const char *arg)
 363 {
 364         struct dk_gpt *label;
 365         int     fd;
 366         char    path[MAXPATHLEN];
 367 
 368         (void) snprintf(path, sizeof (path), "%s%s%s",
 369             ZFS_RDISK_ROOT, strrchr(arg, '/'), BACKUP_SLICE);
 370         if ((fd = open(path, O_RDWR | O_NDELAY)) < 0)
 371                 return (B_FALSE);
 372         if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) {
 373                 (void) close(fd);
 374                 return (B_FALSE);
 375         }
 376         efi_free(label);
 377         (void) close(fd);
 378         return (B_TRUE);
 379 }
 380 
 381 /*
 382  * Create a leaf vdev.  Determine if this is a file or a device.  If it's a
 383  * device, fill in the device id to make a complete nvlist.  Valid forms for a
 384  * leaf vdev are:
 385  *
 386  *      /dev/dsk/xxx    Complete disk path
 387  *      /xxx            Full path to file
 388  *      xxx             Shorthand for /dev/dsk/xxx
 389  */
 390 static nvlist_t *
 391 make_leaf_vdev(const char *arg, uint64_t is_log, uint64_t is_special)
 392 {
 393         char path[MAXPATHLEN];
 394         struct stat64 statbuf;
 395         nvlist_t *vdev = NULL;
 396         char *type = NULL;
 397         boolean_t wholedisk = B_FALSE;
 398 
 399         /*
 400          * Determine what type of vdev this is, and put the full path into
 401          * 'path'.  We detect whether this is a device of file afterwards by
 402          * checking the st_mode of the file.
 403          */
 404         if (arg[0] == '/') {
 405                 /*
 406                  * Complete device or file path.  Exact type is determined by
 407                  * examining the file descriptor afterwards.
 408                  */
 409                 wholedisk = is_whole_disk(arg);
 410                 if (!wholedisk && (stat64(arg, &statbuf) != 0)) {
 411                         (void) fprintf(stderr,
 412                             gettext("cannot open '%s': %s\n"),
 413                             arg, strerror(errno));
 414                         return (NULL);
 415                 }
 416 
 417                 (void) strlcpy(path, arg, sizeof (path));
 418         } else {
 419                 /*
 420                  * This may be a short path for a device, or it could be total
 421                  * gibberish.  Check to see if it's a known device in
 422                  * /dev/dsk/.  As part of this check, see if we've been given a
 423                  * an entire disk (minus the slice number).
 424                  */
 425                 (void) snprintf(path, sizeof (path), "%s/%s", ZFS_DISK_ROOT,
 426                     arg);
 427                 wholedisk = is_whole_disk(path);
 428                 if (!wholedisk && (stat64(path, &statbuf) != 0)) {
 429                         /*
 430                          * If we got ENOENT, then the user gave us
 431                          * gibberish, so try to direct them with a
 432                          * reasonable error message.  Otherwise,
 433                          * regurgitate strerror() since it's the best we
 434                          * can do.
 435                          */
 436                         if (errno == ENOENT) {
 437                                 (void) fprintf(stderr,
 438                                     gettext("cannot open '%s': no such "
 439                                     "device in %s\n"), arg, ZFS_DISK_ROOT);
 440                                 (void) fprintf(stderr,
 441                                     gettext("must be a full path or "
 442                                     "shorthand device name\n"));
 443                                 return (NULL);
 444                         } else {
 445                                 (void) fprintf(stderr,
 446                                     gettext("cannot open '%s': %s\n"),
 447                                     path, strerror(errno));
 448                                 return (NULL);
 449                         }
 450                 }
 451         }
 452 
 453         /*
 454          * Determine whether this is a device or a file.
 455          */
 456         if (wholedisk || S_ISBLK(statbuf.st_mode)) {
 457                 type = VDEV_TYPE_DISK;
 458         } else if (S_ISREG(statbuf.st_mode)) {
 459                 type = VDEV_TYPE_FILE;
 460         } else {
 461                 (void) fprintf(stderr, gettext("cannot use '%s': must be a "
 462                     "block device or regular file\n"), path);
 463                 return (NULL);
 464         }
 465 
 466         /*
 467          * Finally, we have the complete device or file, and we know that it is
 468          * acceptable to use.  Construct the nvlist to describe this vdev.  All
 469          * vdevs have a 'path' element, and devices also have a 'devid' element.
 470          */
 471         verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
 472         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
 473         verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
 474         verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
 475         verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_SPECIAL,
 476             is_special) == 0);
 477         if (strcmp(type, VDEV_TYPE_DISK) == 0)
 478                 verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
 479                     (uint64_t)wholedisk) == 0);
 480 
 481         /*
 482          * For a whole disk, defer getting its devid until after labeling it.
 483          */
 484         if (S_ISBLK(statbuf.st_mode) && !wholedisk) {
 485                 /*
 486                  * Get the devid for the device.
 487                  */
 488                 int fd;
 489                 ddi_devid_t devid;
 490                 char *minor = NULL, *devid_str = NULL;
 491 
 492                 if ((fd = open(path, O_RDONLY)) < 0) {
 493                         (void) fprintf(stderr, gettext("cannot open '%s': "
 494                             "%s\n"), path, strerror(errno));
 495                         nvlist_free(vdev);
 496                         return (NULL);
 497                 }
 498 
 499                 if (devid_get(fd, &devid) == 0) {
 500                         if (devid_get_minor_name(fd, &minor) == 0 &&
 501                             (devid_str = devid_str_encode(devid, minor)) !=
 502                             NULL) {
 503                                 verify(nvlist_add_string(vdev,
 504                                     ZPOOL_CONFIG_DEVID, devid_str) == 0);
 505                         }
 506                         if (devid_str != NULL)
 507                                 devid_str_free(devid_str);
 508                         if (minor != NULL)
 509                                 devid_str_free(minor);
 510                         devid_free(devid);
 511                 }
 512 
 513                 (void) close(fd);
 514         }
 515 
 516         return (vdev);
 517 }
 518 
 519 /*
 520  * Go through and verify the replication level of the pool is consistent.
 521  * Performs the following checks:
 522  *
 523  *      For the new spec, verifies that devices in mirrors and raidz are the
 524  *      same size.
 525  *
 526  *      If the current configuration already has inconsistent replication
 527  *      levels, ignore any other potential problems in the new spec.
 528  *
 529  *      Otherwise, make sure that the current spec (if there is one) and the new
 530  *      spec have consistent replication levels.
 531  */
/*
 * Replication summary of one top-level vdev, as filled in by
 * get_replication().
 */
typedef struct replication_level {
        char *zprl_type;        /* ZPOOL_CONFIG_TYPE string of the vdev */
        uint64_t zprl_children; /* child count; 1 for a plain disk or file */
        uint64_t zprl_parity;   /* ZPOOL_CONFIG_NPARITY for raidz, else 0 */
} replication_level_t;

/*
 * Largest size difference in bytes (16MB) that get_replication() tolerates
 * between devices of the same mirror or raidz vdev before it reports them as
 * being of different sizes.
 */
#define ZPOOL_FUZZ      (16 * 1024 * 1024)
 539 
 540 /*
 541  * Given a list of toplevel vdevs, return the current replication level.  If
 542  * the config is inconsistent, then NULL is returned.  If 'fatal' is set, then
 543  * an error message will be displayed for each self-inconsistent vdev.
 544  */
 545 static replication_level_t *
 546 get_replication(nvlist_t *nvroot, boolean_t fatal)
 547 {
 548         nvlist_t **top;
 549         uint_t t, toplevels;
 550         nvlist_t **child;
 551         uint_t c, children;
 552         nvlist_t *nv;
 553         char *type;
 554         replication_level_t lastrep = {0};
 555         replication_level_t rep;
 556         replication_level_t *ret;
 557         boolean_t dontreport;
 558 
 559         ret = safe_malloc(sizeof (replication_level_t));
 560 
 561         verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 562             &top, &toplevels) == 0);
 563 
 564         for (t = 0; t < toplevels; t++) {
 565                 uint64_t is_log = B_FALSE;
 566 
 567                 nv = top[t];
 568 
 569                 /*
 570                  * For separate logs we ignore the top level vdev replication
 571                  * constraints.
 572                  */
 573                 (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log);
 574                 if (is_log)
 575                         continue;
 576 
 577                 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE,
 578                     &type) == 0);
 579                 if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 580                     &child, &children) != 0) {
 581                         /*
 582                          * This is a 'file' or 'disk' vdev.
 583                          */
 584                         rep.zprl_type = type;
 585                         rep.zprl_children = 1;
 586                         rep.zprl_parity = 0;
 587                 } else {
 588                         uint64_t vdev_size;
 589 
 590                         /*
 591                          * This is a mirror or RAID-Z vdev.  Go through and make
 592                          * sure the contents are all the same (files vs. disks),
 593                          * keeping track of the number of elements in the
 594                          * process.
 595                          *
 596                          * We also check that the size of each vdev (if it can
 597                          * be determined) is the same.
 598                          */
 599                         rep.zprl_type = type;
 600                         rep.zprl_children = 0;
 601 
 602                         if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
 603                                 verify(nvlist_lookup_uint64(nv,
 604                                     ZPOOL_CONFIG_NPARITY,
 605                                     &rep.zprl_parity) == 0);
 606                                 assert(rep.zprl_parity != 0);
 607                         } else {
 608                                 rep.zprl_parity = 0;
 609                         }
 610 
 611                         /*
 612                          * The 'dontreport' variable indicates that we've
 613                          * already reported an error for this spec, so don't
 614                          * bother doing it again.
 615                          */
 616                         type = NULL;
 617                         dontreport = 0;
 618                         vdev_size = -1ULL;
 619                         for (c = 0; c < children; c++) {
 620                                 nvlist_t *cnv = child[c];
 621                                 char *path;
 622                                 struct stat64 statbuf;
 623                                 uint64_t size = -1ULL;
 624                                 char *childtype;
 625                                 int fd, err;
 626 
 627                                 rep.zprl_children++;
 628 
 629                                 verify(nvlist_lookup_string(cnv,
 630                                     ZPOOL_CONFIG_TYPE, &childtype) == 0);
 631 
 632                                 /*
 633                                  * If this is a replacing or spare vdev, then
 634                                  * get the real first child of the vdev.
 635                                  */
 636                                 if (strcmp(childtype,
 637                                     VDEV_TYPE_REPLACING) == 0 ||
 638                                     strcmp(childtype, VDEV_TYPE_SPARE) == 0) {
 639                                         nvlist_t **rchild;
 640                                         uint_t rchildren;
 641 
 642                                         verify(nvlist_lookup_nvlist_array(cnv,
 643                                             ZPOOL_CONFIG_CHILDREN, &rchild,
 644                                             &rchildren) == 0);
 645                                         assert(rchildren == 2);
 646                                         cnv = rchild[0];
 647 
 648                                         verify(nvlist_lookup_string(cnv,
 649                                             ZPOOL_CONFIG_TYPE,
 650                                             &childtype) == 0);
 651                                 }
 652 
 653                                 verify(nvlist_lookup_string(cnv,
 654                                     ZPOOL_CONFIG_PATH, &path) == 0);
 655 
 656                                 /*
 657                                  * If we have a raidz/mirror that combines disks
 658                                  * with files, report it as an error.
 659                                  */
 660                                 if (!dontreport && type != NULL &&
 661                                     strcmp(type, childtype) != 0) {
 662                                         if (ret != NULL)
 663                                                 free(ret);
 664                                         ret = NULL;
 665                                         if (fatal)
 666                                                 vdev_error(gettext(
 667                                                     "mismatched replication "
 668                                                     "level: %s contains both "
 669                                                     "files and devices\n"),
 670                                                     rep.zprl_type);
 671                                         else
 672                                                 return (NULL);
 673                                         dontreport = B_TRUE;
 674                                 }
 675 
 676                                 /*
 677                                  * According to stat(2), the value of 'st_size'
 678                                  * is undefined for block devices and character
 679                                  * devices.  But there is no effective way to
 680                                  * determine the real size in userland.
 681                                  *
 682                                  * Instead, we'll take advantage of an
 683                                  * implementation detail of spec_size().  If the
 684                                  * device is currently open, then we (should)
 685                                  * return a valid size.
 686                                  *
 687                                  * If we still don't get a valid size (indicated
 688                                  * by a size of 0 or MAXOFFSET_T), then ignore
 689                                  * this device altogether.
 690                                  */
 691                                 if ((fd = open(path, O_RDONLY)) >= 0) {
 692                                         err = fstat64(fd, &statbuf);
 693                                         (void) close(fd);
 694                                 } else {
 695                                         err = stat64(path, &statbuf);
 696                                 }
 697 
 698                                 if (err != 0 ||
 699                                     statbuf.st_size == 0 ||
 700                                     statbuf.st_size == MAXOFFSET_T)
 701                                         continue;
 702 
 703                                 size = statbuf.st_size;
 704 
 705                                 /*
 706                                  * Also make sure that devices and
 707                                  * slices have a consistent size.  If
 708                                  * they differ by a significant amount
 709                                  * (~16MB) then report an error.
 710                                  */
 711                                 if (!dontreport &&
 712                                     (vdev_size != -1ULL &&
 713                                     (labs(size - vdev_size) >
 714                                     ZPOOL_FUZZ))) {
 715                                         if (ret != NULL)
 716                                                 free(ret);
 717                                         ret = NULL;
 718                                         if (fatal)
 719                                                 vdev_error(gettext(
 720                                                     "%s contains devices of "
 721                                                     "different sizes\n"),
 722                                                     rep.zprl_type);
 723                                         else
 724                                                 return (NULL);
 725                                         dontreport = B_TRUE;
 726                                 }
 727 
 728                                 type = childtype;
 729                                 vdev_size = size;
 730                         }
 731                 }
 732 
 733                 /*
 734                  * At this point, we have the replication of the last toplevel
 735                  * vdev in 'rep'.  Compare it to 'lastrep' to see if its
 736                  * different.
 737                  */
 738                 if (lastrep.zprl_type != NULL) {
 739                         if (strcmp(lastrep.zprl_type, rep.zprl_type) != 0) {
 740                                 if (ret != NULL)
 741                                         free(ret);
 742                                 ret = NULL;
 743                                 if (fatal)
 744                                         vdev_error(gettext(
 745                                             "mismatched replication level: "
 746                                             "both %s and %s vdevs are "
 747                                             "present\n"),
 748                                             lastrep.zprl_type, rep.zprl_type);
 749                                 else
 750                                         return (NULL);
 751                         } else if (lastrep.zprl_parity != rep.zprl_parity) {
 752                                 if (ret)
 753                                         free(ret);
 754                                 ret = NULL;
 755                                 if (fatal)
 756                                         vdev_error(gettext(
 757                                             "mismatched replication level: "
 758                                             "both %llu and %llu device parity "
 759                                             "%s vdevs are present\n"),
 760                                             lastrep.zprl_parity,
 761                                             rep.zprl_parity,
 762                                             rep.zprl_type);
 763                                 else
 764                                         return (NULL);
 765                         } else if (lastrep.zprl_children != rep.zprl_children) {
 766                                 if (ret)
 767                                         free(ret);
 768                                 ret = NULL;
 769                                 if (fatal)
 770                                         vdev_error(gettext(
 771                                             "mismatched replication level: "
 772                                             "both %llu-way and %llu-way %s "
 773                                             "vdevs are present\n"),
 774                                             lastrep.zprl_children,
 775                                             rep.zprl_children,
 776                                             rep.zprl_type);
 777                                 else
 778                                         return (NULL);
 779                         }
 780                 }
 781                 lastrep = rep;
 782         }
 783 
 784         if (ret != NULL)
 785                 *ret = rep;
 786 
 787         return (ret);
 788 }
 789 
 790 /*
 791  * Check the replication level of the vdev spec against the current pool.  Calls
 792  * get_replication() to make sure the new spec is self-consistent.  If the pool
 793  * has a consistent replication level, then we ignore any errors.  Otherwise,
 794  * report any difference between the two.
 795  */
 796 static int
 797 check_replication(nvlist_t *config, nvlist_t *newroot)
 798 {
 799         nvlist_t **child;
 800         uint_t  children;
 801         replication_level_t *current = NULL, *new;
 802         int ret;
 803 
 804         /*
 805          * If we have a current pool configuration, check to see if it's
 806          * self-consistent.  If not, simply return success.
 807          */
 808         if (config != NULL) {
 809                 nvlist_t *nvroot;
 810 
 811                 verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 812                     &nvroot) == 0);
 813                 if ((current = get_replication(nvroot, B_FALSE)) == NULL)
 814                         return (0);
 815         }
 816         /*
 817          * for spares there may be no children, and therefore no
 818          * replication level to check
 819          */
 820         if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN,
 821             &child, &children) != 0) || (children == 0)) {
 822                 free(current);
 823                 return (0);
 824         }
 825 
 826         /*
 827          * If all we have is logs then there's no replication level to check.
 828          */
 829         if (num_logs(newroot) == children) {
 830                 free(current);
 831                 return (0);
 832         }
 833 
 834         /*
 835          * Get the replication level of the new vdev spec, reporting any
 836          * inconsistencies found.
 837          */
 838         if ((new = get_replication(newroot, B_TRUE)) == NULL) {
 839                 free(current);
 840                 return (-1);
 841         }
 842 
 843         /*
 844          * Check to see if the new vdev spec matches the replication level of
 845          * the current pool.
 846          */
 847         ret = 0;
 848         if (current != NULL) {
 849                 if (strcmp(current->zprl_type, new->zprl_type) != 0) {
 850                         vdev_error(gettext(
 851                             "mismatched replication level: pool uses %s "
 852                             "and new vdev is %s\n"),
 853                             current->zprl_type, new->zprl_type);
 854                         ret = -1;
 855                 } else if (current->zprl_parity != new->zprl_parity) {
 856                         vdev_error(gettext(
 857                             "mismatched replication level: pool uses %llu "
 858                             "device parity and new vdev uses %llu\n"),
 859                             current->zprl_parity, new->zprl_parity);
 860                         ret = -1;
 861                 } else if (current->zprl_children != new->zprl_children) {
 862                         vdev_error(gettext(
 863                             "mismatched replication level: pool uses %llu-way "
 864                             "%s and new vdev uses %llu-way %s\n"),
 865                             current->zprl_children, current->zprl_type,
 866                             new->zprl_children, new->zprl_type);
 867                         ret = -1;
 868                 }
 869         }
 870 
 871         free(new);
 872         if (current != NULL)
 873                 free(current);
 874 
 875         return (ret);
 876 }
 877 
 878 /*
 879  * Go through and find any whole disks in the vdev specification, labelling them
 880  * as appropriate.  When constructing the vdev spec, we were unable to open this
 881  * device in order to provide a devid.  Now that we have labelled the disk and
 882  * know the pool slice is valid, we can construct the devid now.
 883  *
 884  * If the disk was already labeled with an EFI label, we will have gotten the
 885  * devid already (because we were able to open the whole disk).  Otherwise, we
 886  * need to get the devid after we label the disk.
 887  */
 888 static int
 889 make_disks(zpool_handle_t *zhp, nvlist_t *nv, zpool_boot_label_t boot_type,
 890     uint64_t boot_size)
 891 {
 892         nvlist_t **child;
 893         uint_t c, children;
 894         char *type, *path, *diskname;
 895         char buf[MAXPATHLEN];
 896         uint64_t wholedisk;
 897         int fd;
 898         int ret;
 899         int slice;
 900         ddi_devid_t devid;
 901         char *minor = NULL, *devid_str = NULL;
 902 
 903         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
 904 
 905         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 906             &child, &children) != 0) {
 907 
 908                 if (strcmp(type, VDEV_TYPE_DISK) != 0)
 909                         return (0);
 910 
 911                 /*
 912                  * We have a disk device.  Get the path to the device
 913                  * and see if it's a whole disk by appending the backup
 914                  * slice and stat()ing the device.
 915                  */
 916                 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
 917 
 918                 diskname = strrchr(path, '/');
 919                 assert(diskname != NULL);
 920                 diskname++;
 921 
 922                 if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 923                     &wholedisk) != 0 || !wholedisk) {
 924                         /*
 925                          * This is not whole disk, return error if
 926                          * boot partition creation was requested
 927                          */
 928                         if (boot_type == ZPOOL_CREATE_BOOT_LABEL) {
 929                                 (void) fprintf(stderr,
 930                                     gettext("creating boot partition is only "
 931                                     "supported on whole disk vdevs: %s\n"),
 932                                     diskname);
 933                                 return (-1);
 934                         }
 935                         return (0);
 936                 }
 937 
 938                 ret = zpool_label_disk(g_zfs, zhp, diskname, boot_type,
 939                     boot_size, &slice);
 940                 if (ret == -1)
 941                         return (ret);
 942 
 943                 /*
 944                  * Fill in the devid, now that we've labeled the disk.
 945                  */
 946                 (void) snprintf(buf, sizeof (buf), "%ss%d", path, slice);
 947                 if ((fd = open(buf, O_RDONLY)) < 0) {
 948                         (void) fprintf(stderr,
 949                             gettext("cannot open '%s': %s\n"),
 950                             buf, strerror(errno));
 951                         return (-1);
 952                 }
 953 
 954                 if (devid_get(fd, &devid) == 0) {
 955                         if (devid_get_minor_name(fd, &minor) == 0 &&
 956                             (devid_str = devid_str_encode(devid, minor)) !=
 957                             NULL) {
 958                                 verify(nvlist_add_string(nv,
 959                                     ZPOOL_CONFIG_DEVID, devid_str) == 0);
 960                         }
 961                         if (devid_str != NULL)
 962                                 devid_str_free(devid_str);
 963                         if (minor != NULL)
 964                                 devid_str_free(minor);
 965                         devid_free(devid);
 966                 }
 967 
 968                 /*
 969                  * Update the path to refer to the pool slice.  The presence of
 970                  * the 'whole_disk' field indicates to the CLI that we should
 971                  * chop off the slice number when displaying the device in
 972                  * future output.
 973                  */
 974                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, buf) == 0);
 975 
 976                 (void) close(fd);
 977 
 978                 return (0);
 979         }
 980 
 981         /* illumos kernel does not support booting from multi-vdev pools. */
 982         if ((boot_type == ZPOOL_CREATE_BOOT_LABEL)) {
 983                 if ((strcmp(type, VDEV_TYPE_ROOT) == 0) && children > 1) {
 984                         (void) fprintf(stderr, gettext("boot pool "
 985                             "can not have more than one vdev\n"));
 986                         return (-1);
 987                 }
 988         }
 989 
 990         for (c = 0; c < children; c++) {
 991                 ret = make_disks(zhp, child[c], boot_type, boot_size);
 992                 if (ret != 0)
 993                         return (ret);
 994         }
 995 
 996         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
 997             &child, &children) == 0)
 998                 for (c = 0; c < children; c++) {
 999                         ret = make_disks(zhp, child[c], boot_type, boot_size);
1000                         if (ret != 0)
1001                                 return (ret);
1002                 }
1003 
1004         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1005             &child, &children) == 0)
1006                 for (c = 0; c < children; c++) {
1007                         ret = make_disks(zhp, child[c], boot_type, boot_size);
1008                         if (ret != 0)
1009                                 return (ret);
1010                 }
1011 
1012         return (0);
1013 }
1014 
1015 /*
1016  * Determine if the given path is a hot spare within the given configuration.
1017  */
1018 static boolean_t
1019 is_spare(nvlist_t *config, const char *path)
1020 {
1021         int fd;
1022         pool_state_t state;
1023         char *name = NULL;
1024         nvlist_t *label;
1025         uint64_t guid, spareguid;
1026         nvlist_t *nvroot;
1027         nvlist_t **spares;
1028         uint_t i, nspares;
1029         boolean_t inuse;
1030 
1031         if ((fd = open(path, O_RDONLY)) < 0)
1032                 return (B_FALSE);
1033 
1034         if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
1035             !inuse ||
1036             state != POOL_STATE_SPARE ||
1037             zpool_read_label(fd, &label) != 0) {
1038                 free(name);
1039                 (void) close(fd);
1040                 return (B_FALSE);
1041         }
1042         free(name);
1043         (void) close(fd);
1044 
1045         verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
1046         nvlist_free(label);
1047 
1048         verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
1049             &nvroot) == 0);
1050         if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1051             &spares, &nspares) == 0) {
1052                 for (i = 0; i < nspares; i++) {
1053                         verify(nvlist_lookup_uint64(spares[i],
1054                             ZPOOL_CONFIG_GUID, &spareguid) == 0);
1055                         if (spareguid == guid)
1056                                 return (B_TRUE);
1057                 }
1058         }
1059 
1060         return (B_FALSE);
1061 }
1062 
1063 /*
1064  * Go through and find any devices that are in use.  We rely on libdiskmgt for
1065  * the majority of this task.
1066  */
1067 static boolean_t
1068 is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force,
1069     boolean_t replacing, boolean_t isspare)
1070 {
1071         nvlist_t **child;
1072         uint_t c, children;
1073         char *type, *path;
1074         int ret = 0;
1075         char buf[MAXPATHLEN];
1076         uint64_t wholedisk;
1077         boolean_t anyinuse = B_FALSE;
1078 
1079         verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0);
1080 
1081         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
1082             &child, &children) != 0) {
1083 
1084                 verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
1085 
1086                 /*
1087                  * As a generic check, we look to see if this is a replace of a
1088                  * hot spare within the same pool.  If so, we allow it
1089                  * regardless of what libdiskmgt or zpool_in_use() says.
1090                  */
1091                 if (replacing) {
1092                         if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
1093                             &wholedisk) == 0 && wholedisk)
1094                                 (void) snprintf(buf, sizeof (buf), "%ss0",
1095                                     path);
1096                         else
1097                                 (void) strlcpy(buf, path, sizeof (buf));
1098 
1099                         if (is_spare(config, buf))
1100                                 return (B_FALSE);
1101                 }
1102 
1103                 if (strcmp(type, VDEV_TYPE_DISK) == 0)
1104                         ret = check_device(path, force, isspare);
1105                 else if (strcmp(type, VDEV_TYPE_FILE) == 0)
1106                         ret = check_file(path, force, isspare);
1107 
1108                 return (ret != 0);
1109         }
1110 
1111         for (c = 0; c < children; c++)
1112                 if (is_device_in_use(config, child[c], force, replacing,
1113                     B_FALSE))
1114                         anyinuse = B_TRUE;
1115 
1116         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
1117             &child, &children) == 0)
1118                 for (c = 0; c < children; c++)
1119                         if (is_device_in_use(config, child[c], force, replacing,
1120                             B_TRUE))
1121                                 anyinuse = B_TRUE;
1122 
1123         if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
1124             &child, &children) == 0)
1125                 for (c = 0; c < children; c++)
1126                         if (is_device_in_use(config, child[c], force, replacing,
1127                             B_FALSE))
1128                                 anyinuse = B_TRUE;
1129 
1130         return (anyinuse);
1131 }
1132 
1133 static const char *
1134 is_grouping(const char *type, int *mindev, int *maxdev)
1135 {
1136         if (strncmp(type, "raidz", 5) == 0) {
1137                 const char *p = type + 5;
1138                 char *end;
1139                 long nparity;
1140 
1141                 if (*p == '\0') {
1142                         nparity = 1;
1143                 } else if (*p == '0') {
1144                         return (NULL); /* no zero prefixes allowed */
1145                 } else {
1146                         errno = 0;
1147                         nparity = strtol(p, &end, 10);
1148                         if (errno != 0 || nparity < 1 || nparity >= 255 ||
1149                             *end != '\0')
1150                                 return (NULL);
1151                 }
1152 
1153                 if (mindev != NULL)
1154                         *mindev = nparity + 1;
1155                 if (maxdev != NULL)
1156                         *maxdev = 255;
1157                 return (VDEV_TYPE_RAIDZ);
1158         }
1159 
1160         if (maxdev != NULL)
1161                 *maxdev = INT_MAX;
1162 
1163         if (strcmp(type, "mirror") == 0) {
1164                 if (mindev != NULL)
1165                         *mindev = 2;
1166                 return (VDEV_TYPE_MIRROR);
1167         }
1168 
1169         if (strcmp(type, "spare") == 0) {
1170                 if (mindev != NULL)
1171                         *mindev = 1;
1172                 return (VDEV_TYPE_SPARE);
1173         }
1174 
1175         if (strcmp(type, "log") == 0) {
1176                 if (mindev != NULL)
1177                         *mindev = 1;
1178                 return (VDEV_TYPE_LOG);
1179         }
1180 
1181         if (strcmp(type, "cache") == 0) {
1182                 if (mindev != NULL)
1183                         *mindev = 1;
1184                 return (VDEV_TYPE_L2CACHE);
1185         }
1186 
1187         if (strcmp(type, "special") == 0) {
1188                 if (mindev != NULL)
1189                         *mindev = 1;
1190                 return (VDEV_TYPE_SPECIAL);
1191         }
1192 
1193         return (NULL);
1194 }
1195 
1196 /*
1197  * Construct a syntactically valid vdev specification,
1198  * and ensure that all devices and files exist and can be opened.
1199  * Note: we don't bother freeing anything in the error paths
1200  * because the program is just going to exit anyway.
1201  */
1202 nvlist_t *
1203 construct_spec(int argc, char **argv)
1204 {
1205         nvlist_t *nvroot, *nv, **top, **spares, **l2cache;
1206         int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache;
1207         int nspecial = 0;
1208         const char *type;
1209         boolean_t is_log, seen_logs;
1210         boolean_t is_special, seen_special;
1211 
1212         top = NULL;
1213         toplevels = 0;
1214         spares = NULL;
1215         l2cache = NULL;
1216         nspares = 0;
1217         nlogs = 0;
1218         nl2cache = 0;
1219         is_log = B_FALSE;
1220         seen_logs = B_FALSE;
1221         is_special = B_FALSE;
1222         seen_special = B_FALSE;
1223 
1224         while (argc > 0) {
1225                 nv = NULL;
1226 
1227                 /*
1228                  * If it's a mirror or raidz, the subsequent arguments are
1229                  * its leaves -- until we encounter the next mirror or raidz.
1230                  */
1231                 if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) {
1232                         nvlist_t **child = NULL;
1233                         int c, children = 0;
1234 
1235                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1236                                 if (spares != NULL) {
1237                                         (void) fprintf(stderr,
1238                                             gettext("invalid vdev "
1239                                             "specification: 'spare' can be "
1240                                             "specified only once\n"));
1241                                         return (NULL);
1242                                 }
1243                                 is_log = B_FALSE;
1244                                 is_special = B_FALSE;
1245                         }
1246 
1247                         if (strcmp(type, VDEV_TYPE_LOG) == 0) {
1248                                 if (seen_logs) {
1249                                         (void) fprintf(stderr,
1250                                             gettext("invalid vdev "
1251                                             "specification: 'log' can be "
1252                                             "specified only once\n"));
1253                                         return (NULL);
1254                                 }
1255                                 seen_logs = B_TRUE;
1256                                 is_log = B_TRUE;
1257                                 is_special = B_FALSE;
1258                                 argc--;
1259                                 argv++;
1260                                 /*
1261                                  * A log is not a real grouping device.
1262                                  * We just set is_log and continue.
1263                                  */
1264                                 continue;
1265                         }
1266 
1267                         if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1268                                 if (l2cache != NULL) {
1269                                         (void) fprintf(stderr,
1270                                             gettext("invalid vdev "
1271                                             "specification: 'cache' can be "
1272                                             "specified only once\n"));
1273                                         return (NULL);
1274                                 }
1275                                 is_log = B_FALSE;
1276                                 is_special = B_FALSE;
1277                         }
1278 
1279                         if (strcmp(type, VDEV_TYPE_SPECIAL) == 0) {
1280                                 if (seen_special) {
1281                                         (void) fprintf(stderr,
1282                                             gettext("invalid vdev "
1283                                             "specification: 'special' can be "
1284                                             "specified only once\n"));
1285                                         return (NULL);
1286                                 }
1287                                 seen_special = B_TRUE;
1288                                 is_log = B_FALSE;
1289                                 is_special = B_TRUE;
1290                                 argc--;
1291                                 argv++;
1292                                 /*
1293                                  * A special is not a real grouping device.
1294                                  * We just set is_special and continue.
1295                                  */
1296                                 continue;
1297                         }
1298 
1299                         if (is_log) {
1300                                 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1301                                         (void) fprintf(stderr,
1302                                             gettext("invalid vdev "
1303                                             "specification: unsupported 'log' "
1304                                             "device: %s\n"), type);
1305                                         return (NULL);
1306                                 }
1307                                 nlogs++;
1308                         }
1309 
1310                         if (is_special) {
1311                                 if (strcmp(type, VDEV_TYPE_MIRROR) != 0) {
1312                                         (void) fprintf(stderr,
1313                                             gettext("invalid vdev "
1314                                             "specification: unsupported "
1315                                             "'special' device: %s\n"), type);
1316                                         return (NULL);
1317                                 }
1318                                 nspecial++;
1319                         }
1320 
1321                         for (c = 1; c < argc; c++) {
1322                                 if (is_grouping(argv[c], NULL, NULL) != NULL)
1323                                         break;
1324                                 children++;
1325                                 child = realloc(child,
1326                                     children * sizeof (nvlist_t *));
1327                                 if (child == NULL)
1328                                         zpool_no_memory();
1329                                 if ((nv = make_leaf_vdev(argv[c],
1330                                     (uint64_t)B_FALSE,
1331                                     (uint64_t)B_FALSE)) == NULL)
1332                                         return (NULL);
1333                                 child[children - 1] = nv;
1334                         }
1335 
1336                         if (children < mindev) {
1337                                 (void) fprintf(stderr, gettext("invalid vdev "
1338                                     "specification: %s requires at least %d "
1339                                     "devices\n"), argv[0], mindev);
1340                                 return (NULL);
1341                         }
1342 
1343                         if (children > maxdev) {
1344                                 (void) fprintf(stderr, gettext("invalid vdev "
1345                                     "specification: %s supports no more than "
1346                                     "%d devices\n"), argv[0], maxdev);
1347                                 return (NULL);
1348                         }
1349 
1350                         argc -= c;
1351                         argv += c;
1352 
1353                         if (strcmp(type, VDEV_TYPE_SPARE) == 0) {
1354                                 spares = child;
1355                                 nspares = children;
1356                                 continue;
1357                         } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) {
1358                                 l2cache = child;
1359                                 nl2cache = children;
1360                                 continue;
1361                         } else {
1362                                 verify(nvlist_alloc(&nv, NV_UNIQUE_NAME,
1363                                     0) == 0);
1364                                 verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE,
1365                                     type) == 0);
1366                                 verify(nvlist_add_uint64(nv,
1367                                     ZPOOL_CONFIG_IS_LOG,
1368                                     (uint64_t)is_log) == 0);
1369                                 verify(nvlist_add_uint64(nv,
1370                                     ZPOOL_CONFIG_IS_SPECIAL,
1371                                     (uint64_t)is_special) == 0);
1372                                 if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) {
1373                                         verify(nvlist_add_uint64(nv,
1374                                             ZPOOL_CONFIG_NPARITY,
1375                                             mindev - 1) == 0);
1376                                 }
1377                                 verify(nvlist_add_nvlist_array(nv,
1378                                     ZPOOL_CONFIG_CHILDREN, child,
1379                                     children) == 0);
1380 
1381                                 for (c = 0; c < children; c++)
1382                                         nvlist_free(child[c]);
1383                                 free(child);
1384                         }
1385                 } else {
1386                         /*
1387                          * We have a device.  Pass off to make_leaf_vdev() to
1388                          * construct the appropriate nvlist describing the vdev.
1389                          */
1390                         if ((nv = make_leaf_vdev(argv[0], (uint64_t)is_log,
1391                             (uint64_t)is_special)) == NULL)
1392                                 return (NULL);
1393                         if (is_log)
1394                                 nlogs++;
1395                         if (is_special)
1396                                 nspecial++;
1397                         argc--;
1398                         argv++;
1399                 }
1400 
1401                 toplevels++;
1402                 top = realloc(top, toplevels * sizeof (nvlist_t *));
1403                 if (top == NULL)
1404                         zpool_no_memory();
1405                 top[toplevels - 1] = nv;
1406         }
1407 
1408         if (toplevels == 0 && nspares == 0 && nl2cache == 0) {
1409                 (void) fprintf(stderr, gettext("invalid vdev "
1410                     "specification: at least one toplevel vdev must be "
1411                     "specified\n"));
1412                 return (NULL);
1413         }
1414 
1415         if (seen_special && nspecial == 0) {
1416                 (void) fprintf(stderr, gettext("invalid vdev specification: "
1417                     "special requires at least 1 device\n"));
1418                 return (NULL);
1419         }
1420 
1421         if (seen_logs && nlogs == 0) {
1422                 (void) fprintf(stderr, gettext("invalid vdev specification: "
1423                     "log requires at least 1 device\n"));
1424                 return (NULL);
1425         }
1426 
1427         /*
1428          * Finally, create nvroot and add all top-level vdevs to it.
1429          */
1430         verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
1431         verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
1432             VDEV_TYPE_ROOT) == 0);
1433         verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
1434             top, toplevels) == 0);
1435         if (nspares != 0)
1436                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
1437                     spares, nspares) == 0);
1438         if (nl2cache != 0)
1439                 verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
1440                     l2cache, nl2cache) == 0);
1441 
1442         for (t = 0; t < toplevels; t++)
1443                 nvlist_free(top[t]);
1444         for (t = 0; t < nspares; t++)
1445                 nvlist_free(spares[t]);
1446         for (t = 0; t < nl2cache; t++)
1447                 nvlist_free(l2cache[t]);
1448         if (spares)
1449                 free(spares);
1450         if (l2cache)
1451                 free(l2cache);
1452         free(top);
1453 
1454         return (nvroot);
1455 }
1456 
1457 nvlist_t *
1458 split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
1459     splitflags_t flags, int argc, char **argv)
1460 {
1461         nvlist_t *newroot = NULL, **child;
1462         uint_t c, children;
1463         zpool_boot_label_t boot_type;
1464 
1465         if (argc > 0) {
1466                 if ((newroot = construct_spec(argc, argv)) == NULL) {
1467                         (void) fprintf(stderr, gettext("Unable to build a "
1468                             "pool from the specified devices\n"));
1469                         return (NULL);
1470                 }
1471 
1472                 if (zpool_is_bootable(zhp))
1473                         boot_type = ZPOOL_COPY_BOOT_LABEL;
1474                 else
1475                         boot_type = ZPOOL_NO_BOOT_LABEL;
1476 
1477                 if (!flags.dryrun &&
1478                     make_disks(zhp, newroot, boot_type, 0) != 0) {
1479                         nvlist_free(newroot);
1480                         return (NULL);
1481                 }
1482 
1483                 /* avoid any tricks in the spec */
1484                 verify(nvlist_lookup_nvlist_array(newroot,
1485                     ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
1486                 for (c = 0; c < children; c++) {
1487                         char *path;
1488                         const char *type;
1489                         int min, max;
1490 
1491                         verify(nvlist_lookup_string(child[c],
1492                             ZPOOL_CONFIG_PATH, &path) == 0);
1493                         if ((type = is_grouping(path, &min, &max)) != NULL) {
1494                                 (void) fprintf(stderr, gettext("Cannot use "
1495                                     "'%s' as a device for splitting\n"), type);
1496                                 nvlist_free(newroot);
1497                                 return (NULL);
1498                         }
1499                 }
1500         }
1501 
1502         if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
1503                 nvlist_free(newroot);
1504                 return (NULL);
1505         }
1506 
1507         return (newroot);
1508 }
1509 
1510 /*
1511  * Get and validate the contents of the given vdev specification.  This ensures
1512  * that the nvlist returned is well-formed, that all the devices exist, and that
1513  * they are not currently in use by any other known consumer.  The 'poolconfig'
1514  * parameter is the current configuration of the pool when adding devices
1515  * existing pool, and is used to perform additional checks, such as changing the
1516  * replication level of the pool.  It can be 'NULL' to indicate that this is a
1517  * new pool.  The 'force' flag controls whether devices should be forcefully
1518  * added, even if they appear in use.
1519  */
1520 nvlist_t *
1521 make_root_vdev(zpool_handle_t *zhp, int force, int check_rep,
1522     boolean_t replacing, boolean_t dryrun, zpool_boot_label_t boot_type,
1523     uint64_t boot_size, int argc, char **argv)
1524 {
1525         nvlist_t *newroot;
1526         nvlist_t *poolconfig = NULL;
1527         is_force = force;
1528 
1529         /*
1530          * Construct the vdev specification.  If this is successful, we know
1531          * that we have a valid specification, and that all devices can be
1532          * opened.
1533          */
1534         if ((newroot = construct_spec(argc, argv)) == NULL)
1535                 return (NULL);
1536 
1537         if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL))
1538                 return (NULL);
1539 
1540         /*
1541          * Validate each device to make sure that its not shared with another
1542          * subsystem.  We do this even if 'force' is set, because there are some
1543          * uses (such as a dedicated dump device) that even '-f' cannot
1544          * override.
1545          */
1546         if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) {
1547                 nvlist_free(newroot);
1548                 return (NULL);
1549         }
1550 
1551         /*
1552          * Check the replication level of the given vdevs and report any errors
1553          * found.  We include the existing pool spec, if any, as we need to
1554          * catch changes against the existing replication level.
1555          */
1556         if (check_rep && check_replication(poolconfig, newroot) != 0) {
1557                 nvlist_free(newroot);
1558                 return (NULL);
1559         }
1560 
1561         /*
1562          * Run through the vdev specification and label any whole disks found.
1563          */
1564         if (!dryrun && make_disks(zhp, newroot, boot_type, boot_size) != 0) {
1565                 nvlist_free(newroot);
1566                 return (NULL);
1567         }
1568 
1569         return (newroot);
1570 }