Print this page
6319 assertion failed in zio_ddt_write: bp->blk_birth == txg
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/zio.c
+++ new/usr/src/uts/common/fs/zfs/zio.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 23 * Copyright (c) 2011, 2014 by Delphix. All rights reserved.
24 24 * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
25 25 * Copyright 2013 Joyent, Inc. All rights reserved.
26 26 */
27 27
28 28 #include <sys/sysmacros.h>
29 29 #include <sys/zfs_context.h>
30 30 #include <sys/fm/fs/zfs.h>
31 31 #include <sys/spa.h>
32 32 #include <sys/txg.h>
33 33 #include <sys/spa_impl.h>
34 34 #include <sys/vdev_impl.h>
35 35 #include <sys/zio_impl.h>
36 36 #include <sys/zio_compress.h>
37 37 #include <sys/zio_checksum.h>
38 38 #include <sys/dmu_objset.h>
39 39 #include <sys/arc.h>
40 40 #include <sys/ddt.h>
41 41 #include <sys/zfs_zone.h>
42 42 #include <sys/blkptr.h>
43 43 #include <sys/zfeature.h>
44 44
/*
 * ==========================================================================
 * I/O type descriptions
 * ==========================================================================
 */
const char *zio_type_name[ZIO_TYPES] = {
	"zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
	"zio_ioctl"
};

/*
 * ==========================================================================
 * I/O kmem caches
 * ==========================================================================
 */
kmem_cache_t *zio_cache;		/* cache of zio_t structures */
kmem_cache_t *zio_link_cache;		/* cache of parent/child link nodes */
/* One buffer cache per size class; metadata vs. data (see zio_init()). */
kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];

#ifdef _KERNEL
extern vmem_t *zio_alloc_arena;
#endif

/* Sentinel return values used by pipeline stage functions. */
#define	ZIO_PIPELINE_CONTINUE		0x100
#define	ZIO_PIPELINE_STOP		0x101

/*
 * The following actions directly effect the spa's sync-to-convergence logic.
 * The values below define the sync pass when we start performing the action.
 * Care should be taken when changing these values as they directly impact
 * spa_sync() performance. Tuning these values may introduce subtle performance
 * pathologies and should only be done in the context of performance analysis.
 * These tunables will eventually be removed and replaced with #defines once
 * enough analysis has been done to determine optimal values.
 *
 * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
 * regular blocks are not deferred.
 */
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */

/*
 * An allocating zio is one that either currently has the DVA allocate
 * stage set or will have it later in its lifetime.
 */
#define	IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)

/* NOTE(review): presumably biases requeued zios to dispatch first; consumer not visible in this chunk — confirm at use site. */
boolean_t zio_requeue_io_start_cut_in_line = B_TRUE;

#ifdef ZFS_DEBUG
/* Buffers larger than this skip kmem debugging (see cflags in zio_init()). */
int zio_buf_debug_limit = 16384;
#else
int zio_buf_debug_limit = 0;
#endif
101 101
/*
 * Create the zio and zio_link kmem caches plus the per-size I/O buffer
 * caches.  Caches are created for every multiple of SPA_MINBLOCKSIZE up
 * to 4 * SPA_MINBLOCKSIZE, and for each quarter-power of 2 above that;
 * the trailing loop backfills the remaining size classes with the next
 * larger cache so every class has a usable cache pointer.
 */
void
zio_init(void)
{
	size_t c;
	vmem_t *data_alloc_arena = NULL;

#ifdef _KERNEL
	data_alloc_arena = zio_alloc_arena;
#endif
	zio_cache = kmem_cache_create("zio_cache",
	    sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	zio_link_cache = kmem_cache_create("zio_link_cache",
	    sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	/*
	 * For small buffers, we want a cache for each multiple of
	 * SPA_MINBLOCKSIZE. For larger buffers, we want a cache
	 * for each quarter-power of 2.
	 */
	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
		size_t p2 = size;
		size_t align = 0;
		/* Skip expensive kmem debugging for large buffers. */
		size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;

		/* Reduce p2 to the largest power of 2 <= size. */
		while (!ISP2(p2))
			p2 &= p2 - 1;

#ifndef _KERNEL
		/*
		 * If we are using watchpoints, put each buffer on its own page,
		 * to eliminate the performance overhead of trapping to the
		 * kernel when modifying a non-watched buffer that shares the
		 * page with a watched buffer.
		 */
		if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE))
			continue;
#endif
		if (size <= 4 * SPA_MINBLOCKSIZE) {
			align = SPA_MINBLOCKSIZE;
		} else if (IS_P2ALIGNED(size, p2 >> 2)) {
			/* size is a quarter-power of 2: give it a cache. */
			align = MIN(p2 >> 2, PAGESIZE);
		}

		/* align != 0 marks the size classes that get their own cache. */
		if (align != 0) {
			char name[36];
			(void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
			zio_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, NULL, cflags);

			/*
			 * Since zio_data bufs do not appear in crash dumps, we
			 * pass KMC_NOTOUCH so that no allocator metadata is
			 * stored with the buffers.
			 */
			(void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
			zio_data_buf_cache[c] = kmem_cache_create(name, size,
			    align, NULL, NULL, NULL, NULL, data_alloc_arena,
			    cflags | KMC_NOTOUCH);
		}
	}

	/*
	 * Backfill: size classes with no cache of their own alias the
	 * next larger cache (walking downward from the largest).
	 */
	while (--c != 0) {
		ASSERT(zio_buf_cache[c] != NULL);
		if (zio_buf_cache[c - 1] == NULL)
			zio_buf_cache[c - 1] = zio_buf_cache[c];

		ASSERT(zio_data_buf_cache[c] != NULL);
		if (zio_data_buf_cache[c - 1] == NULL)
			zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
	}

	zio_inject_init();
}
176 176
/*
 * Destroy the caches created by zio_init().  Because backfilled size
 * classes alias the same kmem cache across contiguous slots, remember
 * the last cache destroyed and skip repeats to avoid destroying the
 * same cache twice.
 */
void
zio_fini(void)
{
	size_t c;
	kmem_cache_t *last_cache = NULL;
	kmem_cache_t *last_data_cache = NULL;

	for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
		/* Aliased slots are contiguous, so one-back comparison suffices. */
		if (zio_buf_cache[c] != last_cache) {
			last_cache = zio_buf_cache[c];
			kmem_cache_destroy(zio_buf_cache[c]);
		}
		zio_buf_cache[c] = NULL;

		if (zio_data_buf_cache[c] != last_data_cache) {
			last_data_cache = zio_data_buf_cache[c];
			kmem_cache_destroy(zio_data_buf_cache[c]);
		}
		zio_data_buf_cache[c] = NULL;
	}

	kmem_cache_destroy(zio_link_cache);
	kmem_cache_destroy(zio_cache);

	zio_inject_fini();
}
203 203
204 204 /*
205 205 * ==========================================================================
206 206 * Allocate and free I/O buffers
207 207 * ==========================================================================
208 208 */
209 209
210 210 /*
211 211 * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a
212 212 * crashdump if the kernel panics, so use it judiciously. Obviously, it's
213 213 * useful to inspect ZFS metadata, but if possible, we should avoid keeping
214 214 * excess / transient data in-core during a crashdump.
215 215 */
216 216 void *
217 217 zio_buf_alloc(size_t size)
218 218 {
219 219 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
220 220
221 221 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
222 222
223 223 return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
224 224 }
225 225
226 226 /*
227 227 * Use zio_data_buf_alloc to allocate data. The data will not appear in a
228 228 * crashdump if the kernel panics. This exists so that we will limit the amount
229 229 * of ZFS data that shows up in a kernel crashdump. (Thus reducing the amount
230 230 * of kernel heap dumped to disk when the kernel panics)
231 231 */
232 232 void *
233 233 zio_data_buf_alloc(size_t size)
234 234 {
235 235 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
236 236
237 237 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
238 238
239 239 return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
240 240 }
241 241
242 242 void
243 243 zio_buf_free(void *buf, size_t size)
244 244 {
245 245 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
246 246
247 247 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
248 248
249 249 kmem_cache_free(zio_buf_cache[c], buf);
250 250 }
251 251
252 252 void
253 253 zio_data_buf_free(void *buf, size_t size)
254 254 {
255 255 size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
256 256
257 257 VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
258 258
259 259 kmem_cache_free(zio_data_buf_cache[c], buf);
260 260 }
261 261
262 262 /*
263 263 * ==========================================================================
264 264 * Push and pop I/O transform buffers
265 265 * ==========================================================================
266 266 */
/*
 * Push a transform frame onto the zio's transform stack, saving the
 * current data/size so zio_pop_transforms() can restore them.  A
 * nonzero bufsize means the stack owns 'data' and will free it on pop;
 * 'transform' (may be NULL) is applied when the frame is popped.
 */
static void
zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
    zio_transform_func_t *transform)
{
	zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);

	/* Save the zio's current view for restoration on pop. */
	zt->zt_orig_data = zio->io_data;
	zt->zt_orig_size = zio->io_size;
	zt->zt_bufsize = bufsize;
	zt->zt_transform = transform;

	/* Insert at the head: transforms pop in LIFO order. */
	zt->zt_next = zio->io_transform_stack;
	zio->io_transform_stack = zt;

	/* The zio now operates on the transformed buffer. */
	zio->io_data = data;
	zio->io_size = size;
}
284 284
/*
 * Pop every frame on the zio's transform stack, applying each frame's
 * transform callback (if any) and restoring the original data/size,
 * then freeing stack-owned buffers and the frames themselves.
 */
static void
zio_pop_transforms(zio_t *zio)
{
	zio_transform_t *zt;

	while ((zt = zio->io_transform_stack) != NULL) {
		/* Run the callback before the buffer is released. */
		if (zt->zt_transform != NULL)
			zt->zt_transform(zio,
			    zt->zt_orig_data, zt->zt_orig_size);

		/* Nonzero bufsize means the stack owns io_data. */
		if (zt->zt_bufsize != 0)
			zio_buf_free(zio->io_data, zt->zt_bufsize);

		zio->io_data = zt->zt_orig_data;
		zio->io_size = zt->zt_orig_size;
		zio->io_transform_stack = zt->zt_next;

		kmem_free(zt, sizeof (zio_transform_t));
	}
}
305 305
306 306 /*
307 307 * ==========================================================================
308 308 * I/O transform callbacks for subblocks and decompression
309 309 * ==========================================================================
310 310 */
/*
 * Transform callback for subblock I/O: on read, copy the smaller
 * logical payload out of the larger physical buffer.  Writes require
 * no copy-back.
 */
static void
zio_subblock(zio_t *zio, void *data, uint64_t size)
{
	ASSERT(zio->io_size > size);

	if (zio->io_type == ZIO_TYPE_READ)
		bcopy(zio->io_data, data, size);
}
319 319
/*
 * Transform callback: decompress io_data into 'data' using the bp's
 * compression algorithm.  Decompression failure surfaces as EIO on
 * the zio; skipped entirely if the zio already carries an error.
 */
static void
zio_decompress(zio_t *zio, void *data, uint64_t size)
{
	if (zio->io_error == 0 &&
	    zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
	    zio->io_data, data, zio->io_size, size) != 0)
		zio->io_error = SET_ERROR(EIO);
}
328 328
329 329 /*
330 330 * ==========================================================================
331 331 * I/O parent/child relationships and pipeline interlocks
332 332 * ==========================================================================
333 333 */
/*
 * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 * continue calling these functions until they return NULL.
 * Otherwise, the next caller will pick up the list walk in
 * some indeterminate state. (Otherwise every caller would
 * have to pass in a cookie to keep the state represented by
 * io_walk_link, which gets annoying.)
 */
zio_t *
zio_walk_parents(zio_t *cio)
{
	zio_link_t *zl = cio->io_walk_link;
	list_t *pl = &cio->io_parent_list;

	/* NULL cursor means start at the list head; otherwise advance. */
	zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
	cio->io_walk_link = zl;

	/* End of list: cursor resets to NULL for the next walk. */
	if (zl == NULL)
		return (NULL);

	ASSERT(zl->zl_child == cio);
	return (zl->zl_parent);
}
357 357
358 358 zio_t *
359 359 zio_walk_children(zio_t *pio)
360 360 {
361 361 zio_link_t *zl = pio->io_walk_link;
362 362 list_t *cl = &pio->io_child_list;
363 363
364 364 zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
365 365 pio->io_walk_link = zl;
366 366
367 367 if (zl == NULL)
368 368 return (NULL);
369 369
370 370 ASSERT(zl->zl_parent == pio);
371 371 return (zl->zl_child);
372 372 }
373 373
374 374 zio_t *
375 375 zio_unique_parent(zio_t *cio)
376 376 {
377 377 zio_t *pio = zio_walk_parents(cio);
378 378
379 379 VERIFY(zio_walk_parents(cio) == NULL);
380 380 return (pio);
381 381 }
382 382
/*
 * Link cio as a child of pio.  The child is counted against the
 * parent's per-wait-type outstanding counts, but only for wait states
 * the child has not already passed.  Lock order is child before
 * parent, matching zio_remove_child().
 */
void
zio_add_child(zio_t *pio, zio_t *cio)
{
	zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);

	/*
	 * Logical I/Os can have logical, gang, or vdev children.
	 * Gang I/Os can have gang or vdev children.
	 * Vdev I/Os can only have vdev children.
	 * The following ASSERT captures all of these constraints.
	 */
	ASSERT(cio->io_child_type <= pio->io_child_type);

	zl->zl_parent = pio;
	zl->zl_child = cio;

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	/* Can't add children to a parent that has already completed. */
	ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);

	/* Count only the wait states the child hasn't reached yet. */
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_children[cio->io_child_type][w] += !cio->io_state[w];

	list_insert_head(&pio->io_child_list, zl);
	list_insert_head(&cio->io_parent_list, zl);

	pio->io_child_count++;
	cio->io_parent_count++;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);
}
416 416
/*
 * Unlink the parent/child relationship described by zl and free the
 * link node.  Lock order (child then parent) mirrors zio_add_child().
 */
static void
zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
{
	ASSERT(zl->zl_parent == pio);
	ASSERT(zl->zl_child == cio);

	mutex_enter(&cio->io_lock);
	mutex_enter(&pio->io_lock);

	list_remove(&pio->io_child_list, zl);
	list_remove(&cio->io_parent_list, zl);

	pio->io_child_count--;
	cio->io_parent_count--;

	mutex_exit(&pio->io_lock);
	mutex_exit(&cio->io_lock);

	kmem_cache_free(zio_link_cache, zl);
}
437 437
/*
 * If the zio still has outstanding children of the given type in the
 * given wait class, mark it stalled on that count and rewind the
 * pipeline one stage so the current stage re-executes when the last
 * child notifies us (see zio_notify_parent()).  Returns B_TRUE if the
 * caller must stop and wait.
 */
static boolean_t
zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
{
	uint64_t *countp = &zio->io_children[child][wait];
	boolean_t waiting = B_FALSE;

	mutex_enter(&zio->io_lock);
	ASSERT(zio->io_stall == NULL);
	if (*countp != 0) {
		/* Back up one stage; it retries when we are resumed. */
		zio->io_stage >>= 1;
		zio->io_stall = countp;
		waiting = B_TRUE;
	}
	mutex_exit(&zio->io_lock);

	return (waiting);
}
455 455
/*
 * Notify pio that child zio reached the given wait state: fold the
 * child's error (unless DONT_PROPAGATE) and reexecute flags into the
 * parent, decrement the outstanding count, and if the parent was
 * stalled on exactly this count, resume its pipeline.
 */
static void
zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
{
	uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
	int *errorp = &pio->io_child_error[zio->io_child_type];

	mutex_enter(&pio->io_lock);
	if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
		*errorp = zio_worst_error(*errorp, zio->io_error);
	pio->io_reexecute |= zio->io_reexecute;
	ASSERT3U(*countp, >, 0);

	(*countp)--;

	if (*countp == 0 && pio->io_stall == countp) {
		/* Last outstanding child: wake the stalled parent. */
		pio->io_stall = NULL;
		mutex_exit(&pio->io_lock);
		zio_execute(pio);
	} else {
		mutex_exit(&pio->io_lock);
	}
}
478 478
479 479 static void
480 480 zio_inherit_child_errors(zio_t *zio, enum zio_child c)
481 481 {
482 482 if (zio->io_child_error[c] != 0 && zio->io_error == 0)
483 483 zio->io_error = zio->io_child_error[c];
484 484 }
485 485
486 486 /*
487 487 * ==========================================================================
488 488 * Create the various types of I/O (read, write, free, etc)
489 489 * ==========================================================================
490 490 */
/*
 * Common constructor for all zio types.  Allocates and zeroes a zio,
 * classifies it (vdev/gang/ddt/logical), snapshots the bp, records the
 * original data/flags/stage/pipeline for reexecution, and links it to
 * its parent (inheriting logical/gang-leader/zone context).
 *
 * 'stage' is the starting pipeline stage; 'pipeline' is the bitmask of
 * stages the zio will run.  Returns the new zio.
 */
static zio_t *
zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_type_t type, zio_priority_t priority, enum zio_flag flags,
    vdev_t *vd, uint64_t offset, const zbookmark_phys_t *zb,
    enum zio_stage stage, enum zio_stage pipeline)
{
	zio_t *zio;

	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
	ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
	ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);

	ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
	ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
	ASSERT(vd || stage == ZIO_STAGE_OPEN);

	zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
	bzero(zio, sizeof (zio_t));

	mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);

	list_create(&zio->io_parent_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_parent_node));
	list_create(&zio->io_child_list, sizeof (zio_link_t),
	    offsetof(zio_link_t, zl_child_node));

	/* Classify by the most specific attribute present. */
	if (vd != NULL)
		zio->io_child_type = ZIO_CHILD_VDEV;
	else if (flags & ZIO_FLAG_GANG_CHILD)
		zio->io_child_type = ZIO_CHILD_GANG;
	else if (flags & ZIO_FLAG_DDT_CHILD)
		zio->io_child_type = ZIO_CHILD_DDT;
	else
		zio->io_child_type = ZIO_CHILD_LOGICAL;

	if (bp != NULL) {
		zio->io_bp = (blkptr_t *)bp;
		zio->io_bp_copy = *bp;
		zio->io_bp_orig = *bp;
		/*
		 * Non-writes (and DDT children) operate on the private
		 * copy so the caller's bp can be freed immediately.
		 */
		if (type != ZIO_TYPE_WRITE ||
		    zio->io_child_type == ZIO_CHILD_DDT)
			zio->io_bp = &zio->io_bp_copy;	/* so caller can free */
		if (zio->io_child_type == ZIO_CHILD_LOGICAL)
			zio->io_logical = zio;
		if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
			pipeline |= ZIO_GANG_STAGES;
	}

	zio->io_spa = spa;
	zio->io_txg = txg;
	zio->io_done = done;
	zio->io_private = private;
	zio->io_type = type;
	zio->io_priority = priority;
	zio->io_vd = vd;
	zio->io_offset = offset;
	/* Keep the 'orig' copies so the zio can be reexecuted from scratch. */
	zio->io_orig_data = zio->io_data = data;
	zio->io_orig_size = zio->io_size = size;
	zio->io_orig_flags = zio->io_flags = flags;
	zio->io_orig_stage = zio->io_stage = stage;
	zio->io_orig_pipeline = zio->io_pipeline = pipeline;

	/* A zio born past READY/DONE is already in those wait states. */
	zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
	zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);

	if (zb != NULL)
		zio->io_bookmark = *zb;

	if (pio != NULL) {
		/* Inherit zone, logical, and gang-leader context. */
		zio->io_zoneid = pio->io_zoneid;
		if (zio->io_logical == NULL)
			zio->io_logical = pio->io_logical;
		if (zio->io_child_type == ZIO_CHILD_GANG)
			zio->io_gang_leader = pio->io_gang_leader;
		zio_add_child(pio, zio);
	} else {
		zfs_zone_zio_init(zio);
	}

	return (zio);
}
574 574
/*
 * Tear down a zio's lists and synchronization primitives and return it
 * to the zio cache.  Counterpart of zio_create().
 */
static void
zio_destroy(zio_t *zio)
{
	list_destroy(&zio->io_parent_list);
	list_destroy(&zio->io_child_list);
	mutex_destroy(&zio->io_lock);
	cv_destroy(&zio->io_cv);
	kmem_cache_free(zio_cache, zio);
}
584 584
585 585 zio_t *
586 586 zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
587 587 void *private, enum zio_flag flags)
588 588 {
589 589 zio_t *zio;
590 590
591 591 zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
592 592 ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
593 593 ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
594 594
595 595 return (zio);
596 596 }
597 597
598 598 zio_t *
599 599 zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
600 600 {
601 601 return (zio_null(NULL, spa, NULL, done, private, flags));
602 602 }
603 603
604 604 void
605 605 zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp)
606 606 {
607 607 if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) {
608 608 zfs_panic_recover("blkptr at %p has invalid TYPE %llu",
609 609 bp, (longlong_t)BP_GET_TYPE(bp));
610 610 }
611 611 if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS ||
612 612 BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) {
613 613 zfs_panic_recover("blkptr at %p has invalid CHECKSUM %llu",
614 614 bp, (longlong_t)BP_GET_CHECKSUM(bp));
615 615 }
616 616 if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS ||
617 617 BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) {
618 618 zfs_panic_recover("blkptr at %p has invalid COMPRESS %llu",
619 619 bp, (longlong_t)BP_GET_COMPRESS(bp));
620 620 }
621 621 if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) {
622 622 zfs_panic_recover("blkptr at %p has invalid LSIZE %llu",
623 623 bp, (longlong_t)BP_GET_LSIZE(bp));
624 624 }
625 625 if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) {
626 626 zfs_panic_recover("blkptr at %p has invalid PSIZE %llu",
627 627 bp, (longlong_t)BP_GET_PSIZE(bp));
628 628 }
629 629
630 630 if (BP_IS_EMBEDDED(bp)) {
631 631 if (BPE_GET_ETYPE(bp) > NUM_BP_EMBEDDED_TYPES) {
632 632 zfs_panic_recover("blkptr at %p has invalid ETYPE %llu",
633 633 bp, (longlong_t)BPE_GET_ETYPE(bp));
634 634 }
635 635 }
636 636
637 637 /*
638 638 * Pool-specific checks.
639 639 *
640 640 * Note: it would be nice to verify that the blk_birth and
641 641 * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze()
642 642 * allows the birth time of log blocks (and dmu_sync()-ed blocks
643 643 * that are in the log) to be arbitrarily large.
644 644 */
645 645 for (int i = 0; i < BP_GET_NDVAS(bp); i++) {
646 646 uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]);
647 647 if (vdevid >= spa->spa_root_vdev->vdev_children) {
648 648 zfs_panic_recover("blkptr at %p DVA %u has invalid "
649 649 "VDEV %llu",
650 650 bp, i, (longlong_t)vdevid);
651 651 }
652 652 vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid];
653 653 if (vd == NULL) {
654 654 zfs_panic_recover("blkptr at %p DVA %u has invalid "
655 655 "VDEV %llu",
656 656 bp, i, (longlong_t)vdevid);
657 657 }
658 658 if (vd->vdev_ops == &vdev_hole_ops) {
659 659 zfs_panic_recover("blkptr at %p DVA %u has hole "
660 660 "VDEV %llu",
661 661 bp, i, (longlong_t)vdevid);
662 662
663 663 }
664 664 if (vd->vdev_ops == &vdev_missing_ops) {
665 665 /*
666 666 * "missing" vdevs are valid during import, but we
667 667 * don't have their detailed info (e.g. asize), so
668 668 * we can't perform any more checks on them.
669 669 */
670 670 continue;
671 671 }
672 672 uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]);
673 673 uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]);
674 674 if (BP_IS_GANG(bp))
675 675 asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE);
676 676 if (offset + asize > vd->vdev_asize) {
677 677 zfs_panic_recover("blkptr at %p DVA %u has invalid "
678 678 "OFFSET %llu",
679 679 bp, i, (longlong_t)offset);
680 680 }
681 681 }
682 682 }
683 683
/*
 * Create a logical read zio for the block described by bp.  The bp is
 * sanity-checked first; the zio's txg is the block's physical birth
 * time.  DDT children use the DDT read pipeline.
 */
zio_t *
zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
    void *data, uint64_t size, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	zfs_blkptr_verify(spa, bp);

	zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
	    data, size, done, private,
	    ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);

	return (zio);
}
701 701
/*
 * Create a logical write zio governed by the write policy 'zp'
 * (checksum, compression, type, level, copies, dedup).  'ready' fires
 * when the block is allocated, 'physdone' when physical child I/O
 * completes, 'done' when the zio finishes.  DDT children use the DDT
 * write pipeline.
 */
zio_t *
zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
    void *data, uint64_t size, const zio_prop_t *zp,
    zio_done_func_t *ready, zio_done_func_t *physdone, zio_done_func_t *done,
    void *private,
    zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb)
{
	zio_t *zio;

	/* Validate the caller's write policy up front. */
	ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
	    zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
	    zp->zp_compress >= ZIO_COMPRESS_OFF &&
	    zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
	    DMU_OT_IS_VALID(zp->zp_type) &&
	    zp->zp_level < 32 &&
	    zp->zp_copies > 0 &&
	    zp->zp_copies <= spa_max_replication(spa));

	zio = zio_create(pio, spa, txg, bp, data, size, done, private,
	    ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
	    ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
	    ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);

	zio->io_ready = ready;
	zio->io_physdone = physdone;
	zio->io_prop = *zp;

	/*
	 * Data can be NULL if we are going to call zio_write_override() to
	 * provide the already-allocated BP. But we may need the data to
	 * verify a dedup hit (if requested). In this case, don't try to
	 * dedup (just take the already-allocated BP verbatim).
	 */
	if (data == NULL && zio->io_prop.zp_dedup_verify) {
		zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE;
	}

	return (zio);
}
741 741
742 742 zio_t *
743 743 zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
744 744 uint64_t size, zio_done_func_t *done, void *private,
745 745 zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb)
746 746 {
747 747 zio_t *zio;
748 748
749 749 zio = zio_create(pio, spa, txg, bp, data, size, done, private,
750 750 ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
751 751 ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
752 752
753 753 return (zio);
754 754 }
755 755
/*
 * Supply an already-allocated bp for a logical write (the dmu_sync()
 * path).  Must be called while the zio is still OPEN and in the
 * syncing txg; resets the write properties to match how the bp was
 * originally written.
 */
void
zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite)
{
	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));

	/*
	 * We must reset the io_prop to match the values that existed
	 * when the bp was first written by dmu_sync() keeping in mind
	 * that nopwrite and dedup are mutually exclusive.
	 */
	zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup;
	zio->io_prop.zp_nopwrite = nopwrite;
	zio->io_prop.zp_copies = copies;
	zio->io_bp_override = bp;
}
774 774
/*
 * Free the block described by bp in txg: either immediately via a
 * synchronous free zio, or by queueing it on the pool's deferred-free
 * list when immediate processing isn't possible (see below).
 */
void
zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
{

	/*
	 * The check for EMBEDDED is a performance optimization. We
	 * process the free here (by ignoring it) rather than
	 * putting it on the list and then processing it in zio_free_sync().
	 */
	if (BP_IS_EMBEDDED(bp))
		return;
	metaslab_check_free(spa, bp);

	/*
	 * Frees that are for the currently-syncing txg, are not going to be
	 * deferred, and which will not need to do a read (i.e. not GANG or
	 * DEDUP), can be processed immediately. Otherwise, put them on the
	 * in-memory list for later processing.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp) ||
	    txg != spa->spa_syncing_txg ||
	    spa_sync_pass(spa) >= zfs_sync_pass_deferred_free) {
		bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
	} else {
		VERIFY0(zio_wait(zio_free_sync(NULL, spa, txg, bp, 0)));
	}
}
802 802
/*
 * Create a free zio for bp in the currently-syncing txg.  Embedded
 * bps carry no allocated space, so they degenerate to a null zio.
 */
zio_t *
zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    enum zio_flag flags)
{
	zio_t *zio;
	enum zio_stage stage = ZIO_FREE_PIPELINE;

	ASSERT(!BP_IS_HOLE(bp));
	ASSERT(spa_syncing_txg(spa) == txg);
	ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	metaslab_check_free(spa, bp);
	arc_freed(spa, bp);

	/*
	 * GANG and DEDUP blocks can induce a read (for the gang block header,
	 * or the DDT), so issue them asynchronously so that this thread is
	 * not tied up.
	 */
	if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp))
		stage |= ZIO_STAGE_ISSUE_ASYNC;

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, stage);

	return (zio);
}
834 834
/*
 * Create a claim zio for bp.  Embedded bps occupy no space and reduce
 * to a null zio.  See the block comment below for claim semantics.
 */
zio_t *
zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;

	dprintf_bp(bp, "claiming in txg %llu", txg);

	if (BP_IS_EMBEDDED(bp))
		return (zio_null(pio, spa, NULL, NULL, NULL, 0));

	/*
	 * A claim is an allocation of a specific block.  Claims are needed
	 * to support immediate writes in the intent log.  The issue is that
	 * immediate writes contain committed data, but in a txg that was
	 * *not* committed.  Upon opening the pool after an unclean shutdown,
	 * the intent log claims all blocks that contain immediate write data
	 * so that the SPA knows they're in use.
	 *
	 * All claims *must* be resolved in the first txg -- before the SPA
	 * starts allocating blocks -- so that nothing is allocated twice.
	 * If txg == 0 we just verify that the block is claimable.
	 */
	ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
	ASSERT(txg == spa_first_txg(spa) || txg == 0);
	ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));	/* zdb(1M) */

	zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
	    done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
	    NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);

	return (zio);
}
868 868
/*
 * Create an ioctl zio for vd.  Leaf vdevs get a real ioctl zio; for an
 * interior vdev, recursively fan the ioctl out to every child under a
 * null parent zio that collects their completion.
 */
zio_t *
zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
    zio_done_func_t *done, void *private, enum zio_flag flags)
{
	zio_t *zio;
	int c;

	if (vd->vdev_children == 0) {
		zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
		    ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
		    ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);

		zio->io_cmd = cmd;
	} else {
		zio = zio_null(pio, spa, NULL, NULL, NULL, flags);

		for (c = 0; c < vd->vdev_children; c++)
			zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
			    done, private, flags));
	}

	return (zio);
}
892 892
/*
 * Create a physical read zio against a leaf vdev at a raw byte offset,
 * bypassing DVA translation.  'labels' allows offsets within the vdev
 * label regions; 'checksum' selects the verification algorithm.
 */
zio_t *
zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
    void *data, int checksum, zio_done_func_t *done, void *private,
    zio_priority_t priority, enum zio_flag flags, boolean_t labels)
{
	zio_t *zio;

	ASSERT(vd->vdev_children == 0);
	ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
	    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
	ASSERT3U(offset + size, <=, vd->vdev_psize);

	zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
	    ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
	    NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);

	zio->io_prop.zp_checksum = checksum;

	return (zio);
}
913 913
914 914 zio_t *
915 915 zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
916 916 void *data, int checksum, zio_done_func_t *done, void *private,
917 917 zio_priority_t priority, enum zio_flag flags, boolean_t labels)
918 918 {
919 919 zio_t *zio;
920 920
921 921 ASSERT(vd->vdev_children == 0);
922 922 ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
923 923 offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
924 924 ASSERT3U(offset + size, <=, vd->vdev_psize);
925 925
926 926 zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
927 927 ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, offset,
928 928 NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
929 929
930 930 zio->io_prop.zp_checksum = checksum;
931 931
932 932 if (zio_checksum_table[checksum].ci_eck) {
933 933 /*
934 934 * zec checksums are necessarily destructive -- they modify
935 935 * the end of the write buffer to hold the verifier/checksum.
936 936 * Therefore, we must make a local copy in case the data is
937 937 * being written to multiple places in parallel.
938 938 */
939 939 void *wbuf = zio_buf_alloc(size);
940 940 bcopy(data, wbuf, size);
941 941 zio_push_transform(zio, wbuf, size, size, NULL);
942 942 }
943 943
944 944 return (zio);
945 945 }
946 946
/*
 * Create a child I/O to do some work for us.
 *
 * The child targets vdev 'vd' (which must be one level below the
 * parent's vdev, or below the root vdev for a parentless pio) and runs
 * the vdev-child pipeline starting at VDEV_IO_START.
 */
zio_t *
zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
    void *data, uint64_t size, int type, zio_priority_t priority,
    enum zio_flag flags, zio_done_func_t *done, void *private)
{
	enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
	zio_t *zio;

	ASSERT(vd->vdev_parent ==
	    (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));

	if (type == ZIO_TYPE_READ && bp != NULL) {
		/*
		 * If we have the bp, then the child should perform the
		 * checksum and the parent need not.  This pushes error
		 * detection as close to the leaves as possible and
		 * eliminates redundant checksums in the interior nodes.
		 */
		pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
		pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
	}

	/* Leaf vdev offsets are relative to the end of the front labels. */
	if (vd->vdev_children == 0)
		offset += VDEV_LABEL_START_SIZE;

	flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;

	/*
	 * If we've decided to do a repair, the write is not speculative --
	 * even if the original read was.
	 */
	if (flags & ZIO_FLAG_IO_REPAIR)
		flags &= ~ZIO_FLAG_SPECULATIVE;

	zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
	    done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
	    ZIO_STAGE_VDEV_IO_START >> 1, pipeline);

	/*
	 * Propagate the parent's physdone callback and count this leaf
	 * child on the logical zio so physical completions can be tracked.
	 */
	zio->io_physdone = pio->io_physdone;
	if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL)
		zio->io_logical->io_phys_children++;

	return (zio);
}
994 994
995 995 zio_t *
996 996 zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
997 997 int type, zio_priority_t priority, enum zio_flag flags,
998 998 zio_done_func_t *done, void *private)
999 999 {
1000 1000 zio_t *zio;
1001 1001
1002 1002 ASSERT(vd->vdev_ops->vdev_op_leaf);
1003 1003
1004 1004 zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
1005 1005 data, size, done, private, type, priority,
1006 1006 flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED,
1007 1007 vd, offset, NULL,
1008 1008 ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
1009 1009
1010 1010 return (zio);
1011 1011 }
1012 1012
1013 1013 void
1014 1014 zio_flush(zio_t *zio, vdev_t *vd)
1015 1015 {
1016 1016 zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
1017 1017 NULL, NULL,
1018 1018 ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
1019 1019 }
1020 1020
1021 1021 void
1022 1022 zio_shrink(zio_t *zio, uint64_t size)
1023 1023 {
1024 1024 ASSERT(zio->io_executor == NULL);
1025 1025 ASSERT(zio->io_orig_size == zio->io_size);
1026 1026 ASSERT(size <= zio->io_size);
1027 1027
1028 1028 /*
1029 1029 * We don't shrink for raidz because of problems with the
1030 1030 * reconstruction when reading back less than the block size.
1031 1031 * Note, BP_IS_RAIDZ() assumes no compression.
1032 1032 */
1033 1033 ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
1034 1034 if (!BP_IS_RAIDZ(zio->io_bp))
1035 1035 zio->io_orig_size = zio->io_size = size;
1036 1036 }
1037 1037
1038 1038 /*
1039 1039 * ==========================================================================
1040 1040 * Prepare to read and write logical blocks
1041 1041 * ==========================================================================
1042 1042 */
1043 1043
/*
 * Read pipeline setup: arrange decompression, decode embedded bps,
 * set caching policy, and route dedup reads to the DDT read pipeline.
 */
static int
zio_read_bp_init(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	/*
	 * For compressed logical reads (unless the caller asked for the
	 * raw physical bytes), push a transform so the pipeline inflates
	 * the data into the caller's buffer via zio_decompress.
	 */
	if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
	    zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    !(zio->io_flags & ZIO_FLAG_RAW)) {
		uint64_t psize =
		    BP_IS_EMBEDDED(bp) ? BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp);
		void *cbuf = zio_buf_alloc(psize);

		zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
	}

	/*
	 * Embedded data is stored in the bp itself, so decode it here and
	 * reduce the pipeline to the interlock stages only.
	 */
	if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) {
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
		decode_embedded_bp_compressed(bp, zio->io_data);
	} else {
		ASSERT(!BP_IS_EMBEDDED(bp));
	}

	/* Don't cache level-0 user data, nor DDT ZAP blocks. */
	if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
		zio->io_flags |= ZIO_FLAG_DONT_CACHE;

	/* Deduped logical reads go through the DDT read pipeline. */
	if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
		zio->io_pipeline = ZIO_DDT_READ_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
1077 1077
/*
 * Write pipeline setup: handle bp overrides (WRLOG/dedup-to), decide on
 * compression and embedded-data encoding, choose rewrite vs. fresh
 * allocation during sync-to-convergence, and fill in the bp fields.
 */
static int
zio_write_bp_init(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_prop_t *zp = &zio->io_prop;
	enum zio_compress compress = zp->zp_compress;
	blkptr_t *bp = zio->io_bp;
	uint64_t lsize = zio->io_size;
	uint64_t psize = lsize;
	int pass = 1;

	/*
	 * If our children haven't all reached the ready stage,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (!IO_IS_ALLOCATING(zio))
		return (ZIO_PIPELINE_CONTINUE);

	ASSERT(zio->io_child_type != ZIO_CHILD_DDT);

	if (zio->io_bp_override) {
		ASSERT(bp->blk_birth != zio->io_txg);
		ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);

		*bp = *zio->io_bp_override;
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

		if (BP_IS_EMBEDDED(bp))
			return (ZIO_PIPELINE_CONTINUE);

		/*
		 * If we've been overridden and nopwrite is set then
		 * set the flag accordingly to indicate that a nopwrite
		 * has already occurred.
		 */
		if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) {
			ASSERT(!zp->zp_dedup);
			zio->io_flags |= ZIO_FLAG_NOPWRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}

		ASSERT(!zp->zp_nopwrite);

		if (BP_IS_HOLE(bp) || !zp->zp_dedup)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
		    zp->zp_dedup_verify);

		if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
			BP_SET_DEDUP(bp, 1);
			zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
			return (ZIO_PIPELINE_CONTINUE);
		}
		/*
		 * The override bp's checksum doesn't match the dedup
		 * policy, so we can't dedup to it after all.  Discard
		 * the override and fall through to write a fresh block
		 * (illumos 6319: keeping the stale bp here tripped the
		 * bp->blk_birth == txg assertion in zio_ddt_write()).
		 */
		zio->io_bp_override = NULL;
		BP_ZERO(bp);
	}

	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) {
		/*
		 * We're rewriting an existing block, which means we're
		 * working on behalf of spa_sync().  For spa_sync() to
		 * converge, it must eventually be the case that we don't
		 * have to allocate new blocks.  But compression changes
		 * the blocksize, which forces a reallocate, and makes
		 * convergence take longer.  Therefore, after the first
		 * few passes, stop compressing to ensure convergence.
		 */
		pass = spa_sync_pass(spa);

		ASSERT(zio->io_txg == spa_syncing_txg(spa));
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(!BP_GET_DEDUP(bp));

		if (pass >= zfs_sync_pass_dont_compress)
			compress = ZIO_COMPRESS_OFF;

		/* Make sure someone doesn't change their mind on overwrites */
		ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp),
		    spa_max_replication(spa)) == BP_GET_NDVAS(bp));
	}

	if (compress != ZIO_COMPRESS_OFF) {
		void *cbuf = zio_buf_alloc(lsize);
		psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
		if (psize == 0 || psize == lsize) {
			/* Incompressible (or compression failed): store raw. */
			compress = ZIO_COMPRESS_OFF;
			zio_buf_free(cbuf, lsize);
		} else if (!zp->zp_dedup && psize <= BPE_PAYLOAD_SIZE &&
		    zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) &&
		    spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) {
			/*
			 * Small enough to embed directly in the bp: no
			 * device write needed, so reduce the pipeline to
			 * the interlock stages.
			 */
			encode_embedded_bp_compressed(bp,
			    cbuf, compress, lsize, psize);
			BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA);
			BP_SET_TYPE(bp, zio->io_prop.zp_type);
			BP_SET_LEVEL(bp, zio->io_prop.zp_level);
			zio_buf_free(cbuf, lsize);
			bp->blk_birth = zio->io_txg;
			zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
			ASSERT(spa_feature_is_active(spa,
			    SPA_FEATURE_EMBEDDED_DATA));
			return (ZIO_PIPELINE_CONTINUE);
		} else {
			/*
			 * Round up compressed size to MINBLOCKSIZE and
			 * zero the tail.
			 */
			size_t rounded =
			    P2ROUNDUP(psize, (size_t)SPA_MINBLOCKSIZE);
			if (rounded > psize) {
				bzero((char *)cbuf + psize, rounded - psize);
				psize = rounded;
			}
			if (psize == lsize) {
				compress = ZIO_COMPRESS_OFF;
				zio_buf_free(cbuf, lsize);
			} else {
				zio_push_transform(zio, cbuf,
				    psize, lsize, NULL);
			}
		}
	}

	/*
	 * The final pass of spa_sync() must be all rewrites, but the first
	 * few passes offer a trade-off: allocating blocks defers convergence,
	 * but newly allocated blocks are sequential, so they can be written
	 * to disk faster.  Therefore, we allow the first few passes of
	 * spa_sync() to allocate new blocks, but force rewrites after that.
	 * There should only be a handful of blocks after pass 1 in any case.
	 */
	if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg &&
	    BP_GET_PSIZE(bp) == psize &&
	    pass >= zfs_sync_pass_rewrite) {
		ASSERT(psize != 0);
		enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
		zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
		zio->io_flags |= ZIO_FLAG_IO_REWRITE;
	} else {
		BP_ZERO(bp);
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
	}

	if (psize == 0) {
		/*
		 * Zero-sized write == hole.  Record the birth info when
		 * overwriting an existing block and hole_birth is active.
		 */
		if (zio->io_bp_orig.blk_birth != 0 &&
		    spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) {
			BP_SET_LSIZE(bp, lsize);
			BP_SET_TYPE(bp, zp->zp_type);
			BP_SET_LEVEL(bp, zp->zp_level);
			BP_SET_BIRTH(bp, zio->io_txg, 0);
		}
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
	} else {
		/* Fill in the bp from the write properties. */
		ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
		BP_SET_LSIZE(bp, lsize);
		BP_SET_TYPE(bp, zp->zp_type);
		BP_SET_LEVEL(bp, zp->zp_level);
		BP_SET_PSIZE(bp, psize);
		BP_SET_COMPRESS(bp, compress);
		BP_SET_CHECKSUM(bp, zp->zp_checksum);
		BP_SET_DEDUP(bp, zp->zp_dedup);
		BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
		if (zp->zp_dedup) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
		}
		if (zp->zp_nopwrite) {
			ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
			ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
			zio->io_pipeline |= ZIO_STAGE_NOP_WRITE;
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
1256 1258
1257 1259 static int
1258 1260 zio_free_bp_init(zio_t *zio)
1259 1261 {
1260 1262 blkptr_t *bp = zio->io_bp;
1261 1263
1262 1264 if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1263 1265 if (BP_GET_DEDUP(bp))
1264 1266 zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1265 1267 }
1266 1268
1267 1269 return (ZIO_PIPELINE_CONTINUE);
1268 1270 }
1269 1271
1270 1272 /*
1271 1273 * ==========================================================================
1272 1274 * Execute the I/O pipeline
1273 1275 * ==========================================================================
1274 1276 */
1275 1277
/*
 * Dispatch 'zio' to the appropriate spa taskq for asynchronous pipeline
 * execution.  'q' selects issue vs. interrupt; 'cutinline' puts the zio
 * at the front of the taskq (TQ_FRONT).
 */
static void
zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline)
{
	spa_t *spa = zio->io_spa;
	zio_type_t t = zio->io_type;
	int flags = (cutinline ? TQ_FRONT : 0);

	/*
	 * If we're a config writer or a probe, the normal issue and
	 * interrupt threads may all be blocked waiting for the config lock.
	 * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
	 */
	if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
		t = ZIO_TYPE_NULL;

	/*
	 * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
	 */
	if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
		t = ZIO_TYPE_NULL;

	/*
	 * If this is a high priority I/O, then use the high priority taskq if
	 * available.
	 */
	if (zio->io_priority == ZIO_PRIORITY_NOW &&
	    spa->spa_zio_taskq[t][q + 1].stqs_count != 0)
		q++;

	ASSERT3U(q, <, ZIO_TASKQ_TYPES);

	/*
	 * NB: We are assuming that the zio can only be dispatched
	 * to a single taskq at a time.  It would be a grievous error
	 * to dispatch the zio to another taskq at the same time.
	 */
	ASSERT(zio->io_tqent.tqent_next == NULL);
	spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio,
	    flags, &zio->io_tqent);
}
1316 1318
1317 1319 static boolean_t
1318 1320 zio_taskq_member(zio_t *zio, zio_taskq_type_t q)
1319 1321 {
1320 1322 kthread_t *executor = zio->io_executor;
1321 1323 spa_t *spa = zio->io_spa;
1322 1324
1323 1325 for (zio_type_t t = 0; t < ZIO_TYPES; t++) {
1324 1326 spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
1325 1327 uint_t i;
1326 1328 for (i = 0; i < tqs->stqs_count; i++) {
1327 1329 if (taskq_member(tqs->stqs_taskq[i], executor))
1328 1330 return (B_TRUE);
1329 1331 }
1330 1332 }
1331 1333
1332 1334 return (B_FALSE);
1333 1335 }
1334 1336
/*
 * Pipeline stage that hands the zio off to an ISSUE taskq instead of
 * running the remaining stages in the current context.
 */
static int
zio_issue_async(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);

	/* The taskq thread will resume the pipeline via zio_execute(). */
	return (ZIO_PIPELINE_STOP);
}
1342 1344
/*
 * Continue pipeline execution of 'zio' asynchronously on an INTERRUPT
 * taskq thread.
 */
void
zio_interrupt(zio_t *zio)
{
	zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
}
1348 1350
1349 1351 /*
1350 1352 * Execute the I/O pipeline until one of the following occurs:
1351 1353 *
1352 1354 * (1) the I/O completes
1353 1355 * (2) the pipeline stalls waiting for dependent child I/Os
1354 1356 * (3) the I/O issues, so we're waiting for an I/O completion interrupt
1355 1357 * (4) the I/O is delegated by vdev-level caching or aggregation
1356 1358 * (5) the I/O is deferred due to vdev-level queueing
1357 1359 * (6) the I/O is handed off to another thread.
1358 1360 *
1359 1361 * In all cases, the pipeline stops whenever there's no CPU work; it never
1360 1362 * burns a thread in cv_wait().
1361 1363 *
1362 1364 * There's no locking on io_stage because there's no legitimate way
1363 1365 * for multiple threads to be attempting to process the same I/O.
1364 1366 */
1365 1367 static zio_pipe_stage_t *zio_pipeline[];
1366 1368
/*
 * Drive 'zio' through its pipeline, running each enabled stage in turn
 * until the zio completes, stalls on children, or is handed off to a
 * taskq.  Stages are power-of-two bits in io_pipeline; io_stage records
 * the last stage executed.
 */
void
zio_execute(zio_t *zio)
{
	zio->io_executor = curthread;

	while (zio->io_stage < ZIO_STAGE_DONE) {
		enum zio_stage pipeline = zio->io_pipeline;
		enum zio_stage stage = zio->io_stage;
		int rv;

		ASSERT(!MUTEX_HELD(&zio->io_lock));
		ASSERT(ISP2(stage));
		ASSERT(zio->io_stall == NULL);

		/* Advance to the next stage enabled in this zio's pipeline. */
		do {
			stage <<= 1;
		} while ((stage & pipeline) == 0);

		ASSERT(stage <= ZIO_STAGE_DONE);

		/*
		 * If we are in interrupt context and this pipeline stage
		 * will grab a config lock that is held across I/O,
		 * or may wait for an I/O that needs an interrupt thread
		 * to complete, issue async to avoid deadlock.
		 *
		 * For VDEV_IO_START, we cut in line so that the io will
		 * be sent to disk promptly.
		 */
		if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
		    zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
			boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
			    zio_requeue_io_start_cut_in_line : B_FALSE;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
			return;
		}

		zio->io_stage = stage;
		/* Stage functions are indexed by bit position in the mask. */
		rv = zio_pipeline[highbit64(stage) - 1](zio);

		/* STOP means the stage took ownership (async or stalled). */
		if (rv == ZIO_PIPELINE_STOP)
			return;

		ASSERT(rv == ZIO_PIPELINE_CONTINUE);
	}
}
1413 1415
1414 1416 /*
1415 1417 * ==========================================================================
1416 1418 * Initiate I/O, either sync or async
1417 1419 * ==========================================================================
1418 1420 */
/*
 * Execute 'zio' synchronously: run the pipeline, sleep until io_executor
 * is cleared (i.e. the zio has completed), then destroy the zio and
 * return its error.  The zio must not have been issued yet.
 */
int
zio_wait(zio_t *zio)
{
	int error;

	ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
	ASSERT(zio->io_executor == NULL);

	zio->io_waiter = curthread;

	zio_execute(zio);

	mutex_enter(&zio->io_lock);
	/* Completion clears io_executor and signals io_cv. */
	while (zio->io_executor != NULL)
		cv_wait(&zio->io_cv, &zio->io_lock);
	mutex_exit(&zio->io_lock);

	error = zio->io_error;
	zio_destroy(zio);

	return (error);
}
1441 1443
/*
 * Launch 'zio' asynchronously; the caller relinquishes ownership.
 * A parentless logical zio is adopted by the per-CPU "Godfather" root
 * zio so the pool can wait for its completion before unloading.
 */
void
zio_nowait(zio_t *zio)
{
	ASSERT(zio->io_executor == NULL);

	if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
	    zio_unique_parent(zio) == NULL) {
		/*
		 * This is a logical async I/O with no parent to wait for it.
		 * We add it to the spa_async_root_zio "Godfather" I/O which
		 * will ensure they complete prior to unloading the pool.
		 */
		spa_t *spa = zio->io_spa;

		zio_add_child(spa->spa_async_zio_root[CPU_SEQID], zio);
	}

	zio_execute(zio);
}
1461 1463
1462 1464 /*
1463 1465 * ==========================================================================
1464 1466 * Reexecute or suspend/resume failed I/O
1465 1467 * ==========================================================================
1466 1468 */
1467 1469
/*
 * Reset a previously failed logical zio (and, recursively, all of its
 * children) back to its original pre-execution state, then run it
 * through the pipeline again.
 */
static void
zio_reexecute(zio_t *pio)
{
	zio_t *cio, *cio_next;

	ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
	ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
	ASSERT(pio->io_gang_leader == NULL);
	ASSERT(pio->io_gang_tree == NULL);

	/* Restore the zio's original flags/stage/pipeline and clear errors. */
	pio->io_flags = pio->io_orig_flags;
	pio->io_stage = pio->io_orig_stage;
	pio->io_pipeline = pio->io_orig_pipeline;
	pio->io_reexecute = 0;
	pio->io_flags |= ZIO_FLAG_REEXECUTED;
	pio->io_error = 0;
	for (int w = 0; w < ZIO_WAIT_TYPES; w++)
		pio->io_state[w] = 0;
	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		pio->io_child_error[c] = 0;

	/* An allocating zio must start over with a fresh bp. */
	if (IO_IS_ALLOCATING(pio))
		BP_ZERO(pio->io_bp);

	/*
	 * As we reexecute pio's children, new children could be created.
	 * New children go to the head of pio's io_child_list, however,
	 * so we will (correctly) not reexecute them.  The key is that
	 * the remainder of pio's io_child_list, from 'cio_next' onward,
	 * cannot be affected by any side effects of reexecuting 'cio'.
	 */
	for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
		cio_next = zio_walk_children(pio);
		mutex_enter(&pio->io_lock);
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			pio->io_children[cio->io_child_type][w]++;
		mutex_exit(&pio->io_lock);
		zio_reexecute(cio);
	}

	/*
	 * Now that all children have been reexecuted, execute the parent.
	 * We don't reexecute "The Godfather" I/O here as it's the
	 * responsibility of the caller to wait on him.
	 */
	if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
		zio_execute(pio);
}
1516 1518
/*
 * Suspend I/O on the pool after an uncorrectable failure (or panic,
 * if that is the pool's failmode).  The failed logical zio, if any,
 * is parked under spa_suspend_zio_root until zio_resume().
 */
void
zio_suspend(spa_t *spa, zio_t *zio)
{
	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
		    "failure and the failure mode property for this pool "
		    "is set to panic.", spa_name(spa));

	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);

	mutex_enter(&spa->spa_suspend_lock);

	/* Lazily create the godfather root that holds suspended zios. */
	if (spa->spa_suspend_zio_root == NULL)
		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
		    ZIO_FLAG_GODFATHER);

	spa->spa_suspended = B_TRUE;

	if (zio != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
		ASSERT(zio != spa->spa_suspend_zio_root);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
		ASSERT(zio_unique_parent(zio) == NULL);
		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
		zio_add_child(spa->spa_suspend_zio_root, zio);
	}

	mutex_exit(&spa->spa_suspend_lock);
}
1547 1549
/*
 * Resume a suspended pool: clear the suspended state, wake waiters, and
 * reexecute everything parked under the suspend root.  Returns the error
 * from waiting on the reexecuted I/O (0 if nothing was suspended).
 */
int
zio_resume(spa_t *spa)
{
	zio_t *pio;

	/*
	 * Reexecute all previously suspended i/o.
	 */
	mutex_enter(&spa->spa_suspend_lock);
	spa->spa_suspended = B_FALSE;
	cv_broadcast(&spa->spa_suspend_cv);
	pio = spa->spa_suspend_zio_root;
	spa->spa_suspend_zio_root = NULL;
	mutex_exit(&spa->spa_suspend_lock);

	if (pio == NULL)
		return (0);

	zio_reexecute(pio);
	return (zio_wait(pio));
}
1569 1571
1570 1572 void
1571 1573 zio_resume_wait(spa_t *spa)
1572 1574 {
1573 1575 mutex_enter(&spa->spa_suspend_lock);
1574 1576 while (spa_suspended(spa))
1575 1577 cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1576 1578 mutex_exit(&spa->spa_suspend_lock);
1577 1579 }
1578 1580
1579 1581 /*
1580 1582 * ==========================================================================
1581 1583 * Gang blocks.
1582 1584 *
1583 1585 * A gang block is a collection of small blocks that looks to the DMU
1584 1586 * like one large block. When zio_dva_allocate() cannot find a block
1585 1587 * of the requested size, due to either severe fragmentation or the pool
1586 1588 * being nearly full, it calls zio_write_gang_block() to construct the
1587 1589 * block from smaller fragments.
1588 1590 *
1589 1591 * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1590 1592 * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
1591 1593 * an indirect block: it's an array of block pointers. It consumes
1592 1594 * only one sector and hence is allocatable regardless of fragmentation.
1593 1595 * The gang header's bps point to its gang members, which hold the data.
1594 1596 *
1595 1597 * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1596 1598 * as the verifier to ensure uniqueness of the SHA256 checksum.
1597 1599 * Critically, the gang block bp's blk_cksum is the checksum of the data,
1598 1600 * not the gang header. This ensures that data block signatures (needed for
1599 1601 * deduplication) are independent of how the block is physically stored.
1600 1602 *
1601 1603 * Gang blocks can be nested: a gang member may itself be a gang block.
1602 1604 * Thus every gang block is a tree in which root and all interior nodes are
1603 1605 * gang headers, and the leaves are normal blocks that contain user data.
1604 1606 * The root of the gang tree is called the gang leader.
1605 1607 *
1606 1608 * To perform any operation (read, rewrite, free, claim) on a gang block,
1607 1609 * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1608 1610 * in the io_gang_tree field of the original logical i/o by recursively
1609 1611 * reading the gang leader and all gang headers below it. This yields
1610 1612 * an in-core tree containing the contents of every gang header and the
1611 1613 * bps for every constituent of the gang block.
1612 1614 *
1613 1615 * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1614 1616 * and invokes a callback on each bp. To free a gang block, zio_gang_issue()
1615 1617 * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1616 1618 * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1617 1619 * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1618 1620 * headers, since we already have those in io_gang_tree. zio_rewrite_gang()
1619 1621 * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1620 1622 * of the gang header plus zio_checksum_compute() of the data to update the
1621 1623 * gang header's blk_cksum as described above.
1622 1624 *
1623 1625 * The two-phase assemble/issue model solves the problem of partial failure --
1624 1626 * what if you'd freed part of a gang block but then couldn't read the
1625 1627 * gang header for another part? Assembling the entire gang tree first
1626 1628 * ensures that all the necessary gang header I/O has succeeded before
1627 1629 * starting the actual work of free, claim, or write. Once the gang tree
1628 1630 * is assembled, free and claim are in-memory operations that cannot fail.
1629 1631 *
1630 1632 * In the event that a gang write fails, zio_dva_unallocate() walks the
1631 1633 * gang tree to immediately free (i.e. insert back into the space map)
1632 1634 * everything we've allocated. This ensures that we don't get ENOSPC
1633 1635 * errors during repeated suspend/resume cycles due to a flaky device.
1634 1636 *
1635 1637 * Gang rewrites only happen during sync-to-convergence. If we can't assemble
1636 1638 * the gang tree, we won't modify the block, so we can safely defer the free
1637 1639 * (knowing that the block is still intact). If we *can* assemble the gang
1638 1640 * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1639 1641 * each constituent bp and we can allocate a new block on the next sync pass.
1640 1642 *
1641 1643 * In all cases, the gang tree allows complete recovery from partial failure.
1642 1644 * ==========================================================================
1643 1645 */
1644 1646
1645 1647 static zio_t *
1646 1648 zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1647 1649 {
1648 1650 if (gn != NULL)
1649 1651 return (pio);
1650 1652
1651 1653 return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1652 1654 NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1653 1655 &pio->io_bookmark));
1654 1656 }
1655 1657
1656 1658 zio_t *
1657 1659 zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1658 1660 {
1659 1661 zio_t *zio;
1660 1662
1661 1663 if (gn != NULL) {
1662 1664 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1663 1665 gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1664 1666 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1665 1667 /*
1666 1668 * As we rewrite each gang header, the pipeline will compute
1667 1669 * a new gang block header checksum for it; but no one will
1668 1670 * compute a new data checksum, so we do that here. The one
1669 1671 * exception is the gang leader: the pipeline already computed
1670 1672 * its data checksum because that stage precedes gang assembly.
1671 1673 * (Presently, nothing actually uses interior data checksums;
1672 1674 * this is just good hygiene.)
1673 1675 */
1674 1676 if (gn != pio->io_gang_leader->io_gang_tree) {
1675 1677 zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1676 1678 data, BP_GET_PSIZE(bp));
1677 1679 }
1678 1680 /*
1679 1681 * If we are here to damage data for testing purposes,
1680 1682 * leave the GBH alone so that we can detect the damage.
1681 1683 */
1682 1684 if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1683 1685 zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1684 1686 } else {
1685 1687 zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1686 1688 data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1687 1689 ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1688 1690 }
1689 1691
1690 1692 return (zio);
1691 1693 }
1692 1694
/*
 * Gang-tree callback for frees: free one constituent bp.  The gang node
 * and data buffer are unused -- a free needs only the bp itself.
 */
/* ARGSUSED */
zio_t *
zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
	    ZIO_GANG_CHILD_FLAGS(pio)));
}
1700 1702
/*
 * Gang-tree callback for claims: claim one constituent bp.  The gang
 * node and data buffer are unused -- a claim needs only the bp itself.
 */
/* ARGSUSED */
zio_t *
zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
{
	return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
	    NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
}
1708 1710
/*
 * Per-zio-type callbacks invoked by zio_gang_issue() on each bp of the
 * assembled gang tree; NULL entries are zio types with no gang work.
 */
static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
	NULL,
	zio_read_gang,
	zio_rewrite_gang,
	zio_free_gang,
	zio_claim_gang,
	NULL
};
1717 1719
1718 1720 static void zio_gang_tree_assemble_done(zio_t *zio);
1719 1721
1720 1722 static zio_gang_node_t *
1721 1723 zio_gang_node_alloc(zio_gang_node_t **gnpp)
1722 1724 {
1723 1725 zio_gang_node_t *gn;
1724 1726
1725 1727 ASSERT(*gnpp == NULL);
1726 1728
1727 1729 gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1728 1730 gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1729 1731 *gnpp = gn;
1730 1732
1731 1733 return (gn);
1732 1734 }
1733 1735
1734 1736 static void
1735 1737 zio_gang_node_free(zio_gang_node_t **gnpp)
1736 1738 {
1737 1739 zio_gang_node_t *gn = *gnpp;
1738 1740
1739 1741 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1740 1742 ASSERT(gn->gn_child[g] == NULL);
1741 1743
1742 1744 zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1743 1745 kmem_free(gn, sizeof (*gn));
1744 1746 *gnpp = NULL;
1745 1747 }
1746 1748
1747 1749 static void
1748 1750 zio_gang_tree_free(zio_gang_node_t **gnpp)
1749 1751 {
1750 1752 zio_gang_node_t *gn = *gnpp;
1751 1753
1752 1754 if (gn == NULL)
1753 1755 return;
1754 1756
1755 1757 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1756 1758 zio_gang_tree_free(&gn->gn_child[g]);
1757 1759
1758 1760 zio_gang_node_free(gnpp);
1759 1761 }
1760 1762
/*
 * Read the gang header at 'bp' into a freshly allocated gang-tree
 * node hung off *gnpp; zio_gang_tree_assemble_done() then recurses
 * into any child gang headers it discovers.
 */
static void
zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
{
	zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);

	/* Only the gang leader assembles the tree. */
	ASSERT(gio->io_gang_leader == gio);
	ASSERT(BP_IS_GANG(bp));

	zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
	    SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
	    gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
}
1773 1775
/*
 * Read-done callback for zio_gang_tree_assemble(): sanity-check the
 * gang header we just read and recursively assemble any child gang
 * headers referenced from it.
 */
static void
zio_gang_tree_assemble_done(zio_t *zio)
{
	zio_t *gio = zio->io_gang_leader;
	zio_gang_node_t *gn = zio->io_private;
	blkptr_t *bp = zio->io_bp;

	ASSERT(gio == zio_unique_parent(zio));
	ASSERT(zio->io_child_count == 0);

	/* A failed header read leaves this subtree unassembled. */
	if (zio->io_error)
		return;

	if (BP_SHOULD_BYTESWAP(bp))
		byteswap_uint64_array(zio->io_data, zio->io_size);

	ASSERT(zio->io_data == gn->gn_gbh);
	ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
	ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

	/* Descend into any gang-block children of this header. */
	for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
		blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
		if (!BP_IS_GANG(gbp))
			continue;
		zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
	}
}
1801 1803
/*
 * Recursively walk an assembled gang tree, dispatching the per-type
 * issue function (zio_gang_issue_func[]) on the header and on every
 * non-hole gang member; 'data' is advanced through the caller's
 * buffer as members are consumed in order.
 */
static void
zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
{
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;

	ASSERT(BP_IS_GANG(bp) == !!gn);
	ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
	ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);

	/*
	 * If you're a gang header, your data is in gn->gn_gbh.
	 * If you're a gang member, your data is in 'data' and gn == NULL.
	 */
	zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);

	if (gn != NULL) {
		ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);

		for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
			blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
			if (BP_IS_HOLE(gbp))
				continue;
			zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
			data = (char *)data + BP_GET_PSIZE(gbp);
		}
	}

	/* At the root, we must have walked exactly the leader's buffer. */
	if (gn == gio->io_gang_tree)
		ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);

	if (zio != pio)
		zio_nowait(zio);
}
1836 1838
/*
 * Pipeline stage: this zio's bp is a gang block, so become the gang
 * leader and start assembling the gang tree rooted at bp.
 */
static int
zio_gang_assemble(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	zio->io_gang_leader = zio;

	zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);

	return (ZIO_PIPELINE_CONTINUE);
}
1851 1853
/*
 * Pipeline stage: once all gang children have completed, either issue
 * the per-block work across the assembled tree (on success) or tear
 * the tree down (on assembly error).  Either way, reduce the rest of
 * this zio's pipeline to the interlock stages.
 */
static int
zio_gang_issue(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	/* Wait for the assembly reads kicked off by zio_gang_assemble(). */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
	ASSERT(zio->io_child_type > ZIO_CHILD_GANG);

	if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
		zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
	else
		zio_gang_tree_free(&zio->io_gang_tree);

	zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	return (ZIO_PIPELINE_CONTINUE);
}
1872 1874
/*
 * Ready callback for gang-member writes: once this member has its
 * DVAs, fold each DVA's asize into the matching DVA of the parent
 * gang header bp, so the header's asize accounts for the whole
 * gang subtree beneath it.
 */
static void
zio_write_gang_member_ready(zio_t *zio)
{
	zio_t *pio = zio_unique_parent(zio);
	zio_t *gio = zio->io_gang_leader;
	dva_t *cdva = zio->io_bp->blk_dva;
	dva_t *pdva = pio->io_bp->blk_dva;
	uint64_t asize;

	/* A member that allocated nothing contributes nothing. */
	if (BP_IS_HOLE(zio->io_bp))
		return;

	ASSERT(BP_IS_HOLE(&zio->io_bp_orig));

	ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
	ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
	ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
	ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
	ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));

	/* io_lock serializes sibling members updating the parent's bp. */
	mutex_enter(&pio->io_lock);
	for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
		ASSERT(DVA_GET_GANG(&pdva[d]));
		asize = DVA_GET_ASIZE(&pdva[d]);
		asize += DVA_GET_ASIZE(&cdva[d]);
		DVA_SET_ASIZE(&pdva[d], asize);
	}
	mutex_exit(&pio->io_lock);
}
1902 1904
/*
 * Fall back to a gang write when a contiguous allocation failed (see
 * zio_dva_allocate()): allocate a gang header, split pio's data into
 * up to SPA_GBH_NBLKPTRS smaller member writes, and rewrite the
 * header once the members are ready.  pio's own pipeline is reduced
 * to just waiting on the header zio.
 */
static int
zio_write_gang_block(zio_t *pio)
{
	spa_t *spa = pio->io_spa;
	blkptr_t *bp = pio->io_bp;
	zio_t *gio = pio->io_gang_leader;
	zio_t *zio;
	zio_gang_node_t *gn, **gnpp;
	zio_gbh_phys_t *gbh;
	uint64_t txg = pio->io_txg;
	uint64_t resid = pio->io_size;
	uint64_t lsize;
	int copies = gio->io_prop.zp_copies;
	/* The header gets one extra copy, capped at the pool maximum. */
	int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
	zio_prop_t zp;
	int error;

	error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
	    bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
	    METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
	if (error) {
		pio->io_error = error;
		return (ZIO_PIPELINE_CONTINUE);
	}

	/*
	 * Hang the new node off the leader's tree root, or -- for a
	 * nested gang (pio is itself a gang member) -- off the slot
	 * pio's creator passed via io_private.
	 */
	if (pio == gio) {
		gnpp = &gio->io_gang_tree;
	} else {
		gnpp = pio->io_private;
		ASSERT(pio->io_ready == zio_write_gang_member_ready);
	}

	gn = zio_gang_node_alloc(gnpp);
	gbh = gn->gn_gbh;
	bzero(gbh, SPA_GANGBLOCKSIZE);

	/*
	 * Create the gang header.
	 */
	zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
	    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);

	/*
	 * Create and nowait the gang children.
	 */
	for (int g = 0; resid != 0; resid -= lsize, g++) {
		/*
		 * Split the residual evenly across the remaining bp
		 * slots, rounded up to SPA_MINBLOCKSIZE.
		 */
		lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
		    SPA_MINBLOCKSIZE);
		ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);

		zp.zp_checksum = gio->io_prop.zp_checksum;
		zp.zp_compress = ZIO_COMPRESS_OFF;
		zp.zp_type = DMU_OT_NONE;
		zp.zp_level = 0;
		zp.zp_copies = gio->io_prop.zp_copies;
		zp.zp_dedup = B_FALSE;
		zp.zp_dedup_verify = B_FALSE;
		zp.zp_nopwrite = B_FALSE;

		zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
		    (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
		    zio_write_gang_member_ready, NULL, NULL, &gn->gn_child[g],
		    pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
		    &pio->io_bookmark));
	}

	/*
	 * Set pio's pipeline to just wait for zio to finish.
	 */
	pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	zio_nowait(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
1978 1980
1979 1981 /*
1980 1982 * The zio_nop_write stage in the pipeline determines if allocating
1981 1983 * a new bp is necessary. By leveraging a cryptographically secure checksum,
1982 1984 * such as SHA256, we can compare the checksums of the new data and the old
1983 1985 * to determine if allocating a new block is required. The nopwrite
1984 1986 * feature can handle writes in either syncing or open context (i.e. zil
1985 1987 * writes) and as a result is mutually exclusive with dedup.
1986 1988 */
1987 1989 static int
1988 1990 zio_nop_write(zio_t *zio)
1989 1991 {
1990 1992 blkptr_t *bp = zio->io_bp;
1991 1993 blkptr_t *bp_orig = &zio->io_bp_orig;
1992 1994 zio_prop_t *zp = &zio->io_prop;
1993 1995
1994 1996 ASSERT(BP_GET_LEVEL(bp) == 0);
1995 1997 ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1996 1998 ASSERT(zp->zp_nopwrite);
1997 1999 ASSERT(!zp->zp_dedup);
1998 2000 ASSERT(zio->io_bp_override == NULL);
1999 2001 ASSERT(IO_IS_ALLOCATING(zio));
2000 2002
2001 2003 /*
2002 2004 * Check to see if the original bp and the new bp have matching
2003 2005 * characteristics (i.e. same checksum, compression algorithms, etc).
2004 2006 * If they don't then just continue with the pipeline which will
2005 2007 * allocate a new bp.
2006 2008 */
2007 2009 if (BP_IS_HOLE(bp_orig) ||
2008 2010 !zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_dedup ||
2009 2011 BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) ||
2010 2012 BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) ||
2011 2013 BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) ||
2012 2014 zp->zp_copies != BP_GET_NDVAS(bp_orig))
2013 2015 return (ZIO_PIPELINE_CONTINUE);
2014 2016
2015 2017 /*
2016 2018 * If the checksums match then reset the pipeline so that we
2017 2019 * avoid allocating a new bp and issuing any I/O.
2018 2020 */
2019 2021 if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) {
2020 2022 ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup);
2021 2023 ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig));
2022 2024 ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig));
2023 2025 ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF);
2024 2026 ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop,
2025 2027 sizeof (uint64_t)) == 0);
2026 2028
2027 2029 *bp = *bp_orig;
2028 2030 zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2029 2031 zio->io_flags |= ZIO_FLAG_NOPWRITE;
2030 2032 }
2031 2033
2032 2034 return (ZIO_PIPELINE_CONTINUE);
2033 2035 }
2034 2036
2035 2037 /*
2036 2038 * ==========================================================================
2037 2039 * Dedup
2038 2040 * ==========================================================================
2039 2041 */
/*
 * Done callback for the repair reads issued by zio_ddt_read_start():
 * the first successfully read copy becomes dde_repair_data (and its
 * phys is cleared so it is not itself repaired); every other buffer
 * is freed.  The parent's io_lock serializes the sibling callbacks.
 */
static void
zio_ddt_child_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp;
	zio_t *pio = zio_unique_parent(zio);

	mutex_enter(&pio->io_lock);
	ddp = ddt_phys_select(dde, bp);
	if (zio->io_error == 0)
		ddt_phys_clear(ddp);	/* this ddp doesn't need repair */
	if (zio->io_error == 0 && dde->dde_repair_data == NULL)
		dde->dde_repair_data = zio->io_data;
	else
		zio_buf_free(zio->io_data, zio->io_size);
	mutex_exit(&pio->io_lock);
}
2058 2060
/*
 * Start a dedup read.  On the first pass this simply reads bp.  If a
 * previous attempt failed (io_child_error[ZIO_CHILD_DDT] is set), we
 * instead look up the block's DDT entry and read every *other* copy
 * recorded there, so zio_ddt_read_done() can repair the data from
 * whichever copy is good.
 */
static int
zio_ddt_read_start(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = ddt_repair_start(ddt, bp);
		ddt_phys_t *ddp = dde->dde_phys;
		ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
		blkptr_t blk;

		/* Stash the repair entry for zio_ddt_read_done(). */
		ASSERT(zio->io_vsd == NULL);
		zio->io_vsd = dde;

		if (ddp_self == NULL)
			return (ZIO_PIPELINE_CONTINUE);

		/* Read every on-disk copy other than the one that failed. */
		for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
			if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
				continue;
			ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
			    &blk);
			zio_nowait(zio_read(zio, zio->io_spa, &blk,
			    zio_buf_alloc(zio->io_size), zio->io_size,
			    zio_ddt_child_read_done, dde, zio->io_priority,
			    ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
			    &zio->io_bookmark));
		}
		return (ZIO_PIPELINE_CONTINUE);
	}

	zio_nowait(zio_read(zio, zio->io_spa, bp,
	    zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
	    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));

	return (ZIO_PIPELINE_CONTINUE);
}
2101 2103
/*
 * Completion for zio_ddt_read_start().  If the read (and any repair
 * reads) failed, either re-dispatch from the DDT_READ_START stage
 * when no repair entry was set up yet, or copy the recovered data
 * into the caller's buffer and clear the child error.
 */
static int
zio_ddt_read_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;

	if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	if (zio->io_child_error[ZIO_CHILD_DDT]) {
		ddt_t *ddt = ddt_select(zio->io_spa, bp);
		ddt_entry_t *dde = zio->io_vsd;
		if (ddt == NULL) {
			/* No DDT yet; only possible during pool load. */
			ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
			return (ZIO_PIPELINE_CONTINUE);
		}
		if (dde == NULL) {
			/*
			 * No repair entry yet: back the stage up so the
			 * pipeline re-runs DDT_READ_START in repair mode.
			 */
			zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
			zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
			return (ZIO_PIPELINE_STOP);
		}
		if (dde->dde_repair_data != NULL) {
			/* A good copy was found; hand it to the caller. */
			bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
			zio->io_child_error[ZIO_CHILD_DDT] = 0;
		}
		ddt_repair_done(ddt, dde);
		zio->io_vsd = NULL;
	}

	ASSERT(zio->io_vsd == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
2138 2140
/*
 * Return B_TRUE if this zio's data differs from the data already
 * recorded (in flight or on disk) for its DDT entry -- i.e. a true
 * checksum collision -- so the caller can fall back to an ordinary
 * write.  Called with the ddt lock held; drops and re-acquires it
 * around the arc_read() of an on-disk copy.
 */
static boolean_t
zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
{
	spa_t *spa = zio->io_spa;

	/*
	 * Note: we compare the original data, not the transformed data,
	 * because when zio->io_bp is an override bp, we will not have
	 * pushed the I/O transforms.  That's an important optimization
	 * because otherwise we'd compress/encrypt all dmu_sync() data twice.
	 */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		zio_t *lio = dde->dde_lead_zio[p];

		/* Compare against an in-flight write if there is one. */
		if (lio != NULL) {
			return (lio->io_orig_size != zio->io_orig_size ||
			    bcmp(zio->io_orig_data, lio->io_orig_data,
			    zio->io_orig_size) != 0);
		}
	}

	/* Otherwise read an existing on-disk copy and compare to it. */
	for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
		ddt_phys_t *ddp = &dde->dde_phys[p];

		if (ddp->ddp_phys_birth != 0) {
			arc_buf_t *abuf = NULL;
			arc_flags_t aflags = ARC_FLAG_WAIT;
			blkptr_t blk = *zio->io_bp;
			int error;

			ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);

			ddt_exit(ddt);

			error = arc_read(NULL, spa, &blk,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
			    &aflags, &zio->io_bookmark);

			if (error == 0) {
				if (arc_buf_size(abuf) != zio->io_orig_size ||
				    bcmp(abuf->b_data, zio->io_orig_data,
				    zio->io_orig_size) != 0)
					error = SET_ERROR(EEXIST);
				VERIFY(arc_buf_remove_ref(abuf, &abuf));
			}

			ddt_enter(ddt);
			/* A failed read also counts as a collision. */
			return (error != 0);
		}
	}

	return (B_FALSE);
}
2193 2195
/*
 * Ready callback for the leading dedup write of a DDT entry: once the
 * child has its DVAs, publish them into the entry's phys (indexed by
 * copy count) and into the bp of every parent waiting on this write.
 */
static void
zio_ddt_child_write_ready(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	zio_t *pio;

	/* On error, leave the entry untouched for zio_ddt_child_write_done(). */
	if (zio->io_error)
		return;

	ddt_enter(ddt);

	ASSERT(dde->dde_lead_zio[p] == zio);

	ddt_phys_fill(ddp, zio->io_bp);

	while ((pio = zio_walk_parents(zio)) != NULL)
		ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);

	ddt_exit(ddt);
}
2217 2219
/*
 * Done callback for the leading dedup write: relinquish lead-zio
 * status, then either take one DDT reference per parent (success) or
 * clear the phys so the entry is not believed to be on disk (failure).
 */
static void
zio_ddt_child_write_done(zio_t *zio)
{
	int p = zio->io_prop.zp_copies;
	ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		/* One reference for each parent that shares this block. */
		while (zio_walk_parents(zio) != NULL)
			ddt_phys_addref(ddp);
	} else {
		ddt_phys_clear(ddp);
	}

	ddt_exit(ddt);
}
2241 2243
/*
 * Done callback for a ditto (extra-copies) dedup write: install the
 * new ditto phys in the DDT, freeing any previous ditto block it
 * replaces.
 */
static void
zio_ddt_ditto_write_done(zio_t *zio)
{
	int p = DDT_PHYS_DITTO;
	zio_prop_t *zp = &zio->io_prop;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(zio->io_spa, bp);
	ddt_entry_t *dde = zio->io_private;
	ddt_phys_t *ddp = &dde->dde_phys[p];
	ddt_key_t *ddk = &dde->dde_key;

	ddt_enter(ddt);

	ASSERT(ddp->ddp_refcnt == 0);
	ASSERT(dde->dde_lead_zio[p] == zio);
	dde->dde_lead_zio[p] = NULL;

	if (zio->io_error == 0) {
		ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
		ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
		ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
		/* Replace any older ditto block with the one just written. */
		if (ddp->ddp_phys_birth != 0)
			ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
		ddt_phys_fill(ddp, bp);
	}

	ddt_exit(ddt);
}
2270 2272
/*
 * Pipeline stage: resolve a dedup write against the DDT.  Outcomes:
 *  - dedup-verify collision: restart as an ordinary write (upgrading
 *    to a strong checksum first if a weak one was in use);
 *  - the block is already on disk or being written by a lead zio:
 *    reference the existing copy instead of writing;
 *  - otherwise: become the lead zio and issue the actual write.
 * A ditto write may additionally be issued if this entry now needs
 * more copies than it has.  All DDT state is manipulated under the
 * ddt lock.
 */
static int
zio_ddt_write(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	uint64_t txg = zio->io_txg;
	zio_prop_t *zp = &zio->io_prop;
	int p = zp->zp_copies;
	int ditto_copies;
	zio_t *cio = NULL;
	zio_t *dio = NULL;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
	ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);

	ddt_enter(ddt);
	dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = &dde->dde_phys[p];

	if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
		/*
		 * If we're using a weak checksum, upgrade to a strong checksum
		 * and try again.  If we're already using a strong checksum,
		 * we can't resolve it, so just convert to an ordinary write.
		 * (And automatically e-mail a paper to Nature?)
		 */
		if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
			zp->zp_checksum = spa_dedup_checksum(spa);
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			BP_ZERO(bp);
		} else {
			zp->zp_dedup = B_FALSE;
		}
		zio->io_pipeline = ZIO_WRITE_PIPELINE;
		ddt_exit(ddt);
		return (ZIO_PIPELINE_CONTINUE);
	}

	ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
	ASSERT(ditto_copies < SPA_DVAS_PER_BP);

	if (ditto_copies > ddt_ditto_copies_present(dde) &&
	    dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
		zio_prop_t czp = *zp;

		czp.zp_copies = ditto_copies;

		/*
		 * If we arrived here with an override bp, we won't have run
		 * the transform stack, so we won't have the data we need to
		 * generate a child i/o.  So, toss the override bp and restart.
		 * This is safe, because using the override bp is just an
		 * optimization; and it's rare, so the cost doesn't matter.
		 */
		if (zio->io_bp_override) {
			zio_pop_transforms(zio);
			zio->io_stage = ZIO_STAGE_OPEN;
			zio->io_pipeline = ZIO_WRITE_PIPELINE;
			zio->io_bp_override = NULL;
			BP_ZERO(bp);
			ddt_exit(ddt);
			return (ZIO_PIPELINE_CONTINUE);
		}

		dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, &czp, NULL, NULL,
		    zio_ddt_ditto_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
	}

	if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
		/* The block already exists (or is in flight): share it. */
		if (ddp->ddp_phys_birth != 0)
			ddt_bp_fill(ddp, bp, txg);
		if (dde->dde_lead_zio[p] != NULL)
			zio_add_child(zio, dde->dde_lead_zio[p]);
		else
			ddt_phys_addref(ddp);
	} else if (zio->io_bp_override) {
		/* An override bp must carry this txg's birth time. */
		ASSERT(bp->blk_birth == txg);
		ASSERT(BP_EQUAL(bp, zio->io_bp_override));
		ddt_phys_fill(ddp, bp);
		ddt_phys_addref(ddp);
	} else {
		/* First writer of this entry: become the lead zio. */
		cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
		    zio->io_orig_size, zp, zio_ddt_child_write_ready, NULL,
		    zio_ddt_child_write_done, dde, zio->io_priority,
		    ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);

		zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
		dde->dde_lead_zio[p] = cio;
	}

	ddt_exit(ddt);

	if (cio)
		zio_nowait(cio);
	if (dio)
		zio_nowait(dio);

	return (ZIO_PIPELINE_CONTINUE);
}
2380 2382
/* DDT entry touched by the most recent zio_ddt_free(), kept for debugging. */
ddt_entry_t *freedde;			/* for debugging */
2382 2384
/*
 * Pipeline stage: "free" one reference to a dedup block.  Only the
 * DDT refcount is dropped here; the block itself is not returned to
 * the metaslab by this stage.
 */
static int
zio_ddt_free(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	blkptr_t *bp = zio->io_bp;
	ddt_t *ddt = ddt_select(spa, bp);
	ddt_entry_t *dde;
	ddt_phys_t *ddp;

	ASSERT(BP_GET_DEDUP(bp));
	ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

	ddt_enter(ddt);
	freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
	ddp = ddt_phys_select(dde, bp);
	ddt_phys_decref(ddp);
	ddt_exit(ddt);

	return (ZIO_PIPELINE_CONTINUE);
}
2403 2405
2404 2406 /*
2405 2407 * ==========================================================================
2406 2408 * Allocate and free blocks
2407 2409 * ==========================================================================
2408 2410 */
/*
 * Pipeline stage: allocate DVAs for a write from the normal class.
 * On ENOSPC for a block larger than the minimum, fall back to a gang
 * write rather than failing outright.
 */
static int
zio_dva_allocate(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	metaslab_class_t *mc = spa_normal_class(spa);
	blkptr_t *bp = zio->io_bp;
	int error;
	int flags = 0;

	/* A zio that isn't a gang child leads its own (possible) gang. */
	if (zio->io_gang_leader == NULL) {
		ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
		zio->io_gang_leader = zio;
	}

	ASSERT(BP_IS_HOLE(bp));
	ASSERT0(BP_GET_NDVAS(bp));
	ASSERT3U(zio->io_prop.zp_copies, >, 0);
	ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
	ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));

	/*
	 * The dump device does not support gang blocks so allocation on
	 * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
	 * the "fast" gang feature.
	 */
	flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
	flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
	    METASLAB_GANG_CHILD : 0;
	error = metaslab_alloc(spa, mc, zio->io_size, bp,
	    zio->io_prop.zp_copies, zio->io_txg, NULL, flags);

	if (error) {
		spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
		    "size %llu, error %d", spa_name(spa), zio, zio->io_size,
		    error);
		if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
			return (zio_write_gang_block(zio));
		zio->io_error = error;
	}

	return (ZIO_PIPELINE_CONTINUE);
}
2451 2453
2452 2454 static int
2453 2455 zio_dva_free(zio_t *zio)
2454 2456 {
2455 2457 metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2456 2458
2457 2459 return (ZIO_PIPELINE_CONTINUE);
2458 2460 }
2459 2461
2460 2462 static int
2461 2463 zio_dva_claim(zio_t *zio)
2462 2464 {
2463 2465 int error;
2464 2466
2465 2467 error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2466 2468 if (error)
2467 2469 zio->io_error = error;
2468 2470
2469 2471 return (ZIO_PIPELINE_CONTINUE);
2470 2472 }
2471 2473
2472 2474 /*
2473 2475 * Undo an allocation. This is used by zio_done() when an I/O fails
2474 2476 * and we want to give back the block we just allocated.
2475 2477 * This handles both normal blocks and gang blocks.
2476 2478 */
2477 2479 static void
2478 2480 zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2479 2481 {
2480 2482 ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2481 2483 ASSERT(zio->io_bp_override == NULL);
2482 2484
2483 2485 if (!BP_IS_HOLE(bp))
2484 2486 metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2485 2487
2486 2488 if (gn != NULL) {
2487 2489 for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2488 2490 zio_dva_unallocate(zio, gn->gn_child[g],
2489 2491 &gn->gn_gbh->zg_blkptr[g]);
2490 2492 }
2491 2493 }
2492 2494 }
2493 2495
2494 2496 /*
2495 2497 * Try to allocate an intent log block. Return 0 on success, errno on failure.
2496 2498 */
2497 2499 int
2498 2500 zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2499 2501 uint64_t size, boolean_t use_slog)
2500 2502 {
2501 2503 int error = 1;
2502 2504
2503 2505 ASSERT(txg > spa_syncing_txg(spa));
2504 2506
2505 2507 /*
2506 2508 * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2507 2509 * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2508 2510 * when allocating them.
2509 2511 */
2510 2512 if (use_slog) {
2511 2513 error = metaslab_alloc(spa, spa_log_class(spa), size,
2512 2514 new_bp, 1, txg, old_bp,
2513 2515 METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2514 2516 }
2515 2517
2516 2518 if (error) {
2517 2519 error = metaslab_alloc(spa, spa_normal_class(spa), size,
2518 2520 new_bp, 1, txg, old_bp,
2519 2521 METASLAB_HINTBP_AVOID);
2520 2522 }
2521 2523
2522 2524 if (error == 0) {
2523 2525 BP_SET_LSIZE(new_bp, size);
2524 2526 BP_SET_PSIZE(new_bp, size);
2525 2527 BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2526 2528 BP_SET_CHECKSUM(new_bp,
2527 2529 spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2528 2530 ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2529 2531 BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2530 2532 BP_SET_LEVEL(new_bp, 0);
2531 2533 BP_SET_DEDUP(new_bp, 0);
2532 2534 BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2533 2535 }
2534 2536
2535 2537 return (error);
2536 2538 }
2537 2539
2538 2540 /*
2539 2541 * Free an intent log block.
2540 2542 */
2541 2543 void
2542 2544 zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2543 2545 {
2544 2546 ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2545 2547 ASSERT(!BP_IS_GANG(bp));
2546 2548
2547 2549 zio_free(spa, txg, bp);
2548 2550 }
2549 2551
2550 2552 /*
2551 2553 * ==========================================================================
2552 2554 * Read and write to physical devices
2553 2555 * ==========================================================================
2554 2556 */
2555 2557
2556 2558
2557 2559 /*
2558 2560 * Issue an I/O to the underlying vdev. Typically the issue pipeline
2559 2561 * stops after this stage and will resume upon I/O completion.
2560 2562 * However, there are instances where the vdev layer may need to
2561 2563 * continue the pipeline when an I/O was not issued. Since the I/O
2562 2564 * that was sent to the vdev layer might be different than the one
2563 2565 * currently active in the pipeline (see vdev_queue_io()), we explicitly
2564 2566 * force the underlying vdev layers to call either zio_execute() or
2565 2567 * zio_interrupt() to ensure that the pipeline continues with the correct I/O.
2566 2568 */
2567 2569 static int
2568 2570 zio_vdev_io_start(zio_t *zio)
2569 2571 {
2570 2572 vdev_t *vd = zio->io_vd;
2571 2573 uint64_t align;
2572 2574 spa_t *spa = zio->io_spa;
2573 2575
2574 2576 ASSERT(zio->io_error == 0);
2575 2577 ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2576 2578
2577 2579 if (vd == NULL) {
2578 2580 if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2579 2581 spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2580 2582
2581 2583 /*
2582 2584 * The mirror_ops handle multiple DVAs in a single BP.
2583 2585 */
2584 2586 vdev_mirror_ops.vdev_op_io_start(zio);
2585 2587 return (ZIO_PIPELINE_STOP);
2586 2588 }
2587 2589
2588 2590 /*
2589 2591 * We keep track of time-sensitive I/Os so that the scan thread
2590 2592 * can quickly react to certain workloads. In particular, we care
2591 2593 * about non-scrubbing, top-level reads and writes with the following
2592 2594 * characteristics:
2593 2595 * - synchronous writes of user data to non-slog devices
2594 2596 * - any reads of user data
2595 2597 * When these conditions are met, adjust the timestamp of spa_last_io
2596 2598 * which allows the scan thread to adjust its workload accordingly.
2597 2599 */
2598 2600 if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2599 2601 vd == vd->vdev_top && !vd->vdev_islog &&
2600 2602 zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2601 2603 zio->io_txg != spa_syncing_txg(spa)) {
2602 2604 uint64_t old = spa->spa_last_io;
2603 2605 uint64_t new = ddi_get_lbolt64();
2604 2606 if (old != new)
2605 2607 (void) atomic_cas_64(&spa->spa_last_io, old, new);
2606 2608 }
2607 2609
2608 2610 align = 1ULL << vd->vdev_top->vdev_ashift;
2609 2611
2610 2612 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) &&
2611 2613 P2PHASE(zio->io_size, align) != 0) {
2612 2614 /* Transform logical writes to be a full physical block size. */
2613 2615 uint64_t asize = P2ROUNDUP(zio->io_size, align);
2614 2616 char *abuf = zio_buf_alloc(asize);
2615 2617 ASSERT(vd == vd->vdev_top);
2616 2618 if (zio->io_type == ZIO_TYPE_WRITE) {
2617 2619 bcopy(zio->io_data, abuf, zio->io_size);
2618 2620 bzero(abuf + zio->io_size, asize - zio->io_size);
2619 2621 }
2620 2622 zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2621 2623 }
2622 2624
2623 2625 /*
2624 2626 * If this is not a physical io, make sure that it is properly aligned
2625 2627 * before proceeding.
2626 2628 */
2627 2629 if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) {
2628 2630 ASSERT0(P2PHASE(zio->io_offset, align));
2629 2631 ASSERT0(P2PHASE(zio->io_size, align));
2630 2632 } else {
2631 2633 /*
2632 2634 * For physical writes, we allow 512b aligned writes and assume
2633 2635 * the device will perform a read-modify-write as necessary.
2634 2636 */
2635 2637 ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE));
2636 2638 ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE));
2637 2639 }
2638 2640
2639 2641 VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2640 2642
2641 2643 /*
2642 2644 * If this is a repair I/O, and there's no self-healing involved --
2643 2645 * that is, we're just resilvering what we expect to resilver --
2644 2646 * then don't do the I/O unless zio's txg is actually in vd's DTL.
2645 2647 * This prevents spurious resilvering with nested replication.
2646 2648 * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2647 2649 * A is out of date, we'll read from C+D, then use the data to
2648 2650 * resilver A+B -- but we don't actually want to resilver B, just A.
2649 2651 * The top-level mirror has no way to know this, so instead we just
2650 2652 * discard unnecessary repairs as we work our way down the vdev tree.
2651 2653 * The same logic applies to any form of nested replication:
2652 2654 * ditto + mirror, RAID-Z + replacing, etc. This covers them all.
2653 2655 */
2654 2656 if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2655 2657 !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2656 2658 zio->io_txg != 0 && /* not a delegated i/o */
2657 2659 !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2658 2660 ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2659 2661 zio_vdev_io_bypass(zio);
2660 2662 return (ZIO_PIPELINE_CONTINUE);
2661 2663 }
2662 2664
2663 2665 if (vd->vdev_ops->vdev_op_leaf &&
2664 2666 (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2665 2667
2666 2668 if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio))
2667 2669 return (ZIO_PIPELINE_CONTINUE);
2668 2670
2669 2671 if ((zio = vdev_queue_io(zio)) == NULL)
2670 2672 return (ZIO_PIPELINE_STOP);
2671 2673
2672 2674 if (!vdev_accessible(vd, zio)) {
2673 2675 zio->io_error = SET_ERROR(ENXIO);
2674 2676 zio_interrupt(zio);
2675 2677 return (ZIO_PIPELINE_STOP);
2676 2678 }
2677 2679 }
2678 2680
2679 2681 vd->vdev_ops->vdev_op_io_start(zio);
2680 2682 return (ZIO_PIPELINE_STOP);
2681 2683 }
2682 2684
/*
 * Pipeline stage: completion processing for a vdev I/O.  For leaf vdevs
 * this drains the vdev queue, feeds write data to the vdev cache, applies
 * any configured fault injection, and classifies the error; then the
 * vdev's own io_done method runs.  An unexpected error on a leaf that
 * still looks accessible triggers a probe of the device.
 */
static int
zio_vdev_io_done(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;
	/* With no vdev attached, behave as the root (mirror) vdev. */
	vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
	boolean_t unexpected_error = B_FALSE;

	/* Repeat this stage until all child vdev I/Os have completed. */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {

		vdev_queue_io_done(zio);

		if (zio->io_type == ZIO_TYPE_WRITE)
			vdev_cache_write(zio);

		/* Fault injection only overrides an otherwise clean I/O. */
		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_device_injection(vd,
			    zio, EIO);

		if (zio_injection_enabled && zio->io_error == 0)
			zio->io_error = zio_handle_label_injection(zio, EIO);

		if (zio->io_error) {
			if (!vdev_accessible(vd, zio)) {
				zio->io_error = SET_ERROR(ENXIO);
			} else {
				/* Device seems fine, yet the I/O failed. */
				unexpected_error = B_TRUE;
			}
		}
	}

	ops->vdev_op_io_done(zio);

	if (unexpected_error)
		VERIFY(vdev_probe(vd, zio) == NULL);

	return (ZIO_PIPELINE_CONTINUE);
}
2725 2727
/*
 * For non-raidz ZIOs, we can just copy aside the bad data read from the
 * disk, and use that to finish the checksum ereport later.
 *
 * good_buf is the expected data; zcr->zcr_cbdata holds the bad copy
 * saved earlier by zio_vsd_default_cksum_report().
 */
static void
zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
    const void *good_buf)
{
	/* no processing needed */
	zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
}
2737 2739
2738 2740 /*ARGSUSED*/
2739 2741 void
2740 2742 zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2741 2743 {
2742 2744 void *buf = zio_buf_alloc(zio->io_size);
2743 2745
2744 2746 bcopy(zio->io_data, buf, zio->io_size);
2745 2747
2746 2748 zcr->zcr_cbinfo = zio->io_size;
2747 2749 zcr->zcr_cbdata = buf;
2748 2750 zcr->zcr_finish = zio_vsd_default_cksum_finish;
2749 2751 zcr->zcr_free = zio_buf_free;
2750 2752 }
2751 2753
/*
 * Pipeline stage: post-I/O assessment.  Releases per-zio vdev state,
 * applies fault injection, decides whether a failed vdev-less I/O should
 * be retried (requeueing it at the head of the issue queue if so),
 * normalizes leaf errors to ENXIO for inaccessible devices, and invokes
 * the physical-done callback for leaf I/Os.
 */
static int
zio_vdev_io_assess(zio_t *zio)
{
	vdev_t *vd = zio->io_vd;

	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	/*
	 * Release the SCL_ZIO config lock for vdev-less I/Os.
	 * NOTE(review): pairs with an spa_config_enter() in
	 * zio_vdev_io_start() -- confirm if changing lock scope.
	 */
	if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
		spa_config_exit(zio->io_spa, SCL_ZIO, zio);

	/* Free any vdev-specific data attached to this zio. */
	if (zio->io_vsd != NULL) {
		zio->io_vsd_ops->vsd_free(zio);
		zio->io_vsd = NULL;
	}

	if (zio_injection_enabled && zio->io_error == 0)
		zio->io_error = zio_handle_fault_injection(zio, EIO);

	/*
	 * If the I/O failed, determine whether we should attempt to retry it.
	 *
	 * On retry, we cut in line in the issue queue, since we don't want
	 * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
	 */
	if (zio->io_error && vd == NULL &&
	    !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
		ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */
		zio->io_error = 0;
		zio->io_flags |= ZIO_FLAG_IO_RETRY |
		    ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
		/* Back up the pipeline so VDEV_IO_START runs again. */
		zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
		zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
		    zio_requeue_io_start_cut_in_line);
		return (ZIO_PIPELINE_STOP);
	}

	/*
	 * If we got an error on a leaf device, convert it to ENXIO
	 * if the device is not accessible at all.
	 */
	if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    !vdev_accessible(vd, zio))
		zio->io_error = SET_ERROR(ENXIO);

	/*
	 * If we can't write to an interior vdev (mirror or RAID-Z),
	 * set vdev_cant_write so that we stop trying to allocate from it.
	 */
	if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
	    vd != NULL && !vd->vdev_ops->vdev_op_leaf) {
		vd->vdev_cant_write = B_TRUE;
	}

	/* Any error collapses the remaining pipeline to the interlock set. */
	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	if (vd != NULL && vd->vdev_ops->vdev_op_leaf &&
	    zio->io_physdone != NULL) {
		ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED));
		ASSERT(zio->io_child_type == ZIO_CHILD_VDEV);
		zio->io_physdone(zio->io_logical);
	}

	return (ZIO_PIPELINE_CONTINUE);
}
2819 2821
/*
 * Rewind the pipeline so that the VDEV_IO_START stage executes again.
 * (io_stage holds a single stage bit; shifting right steps back one
 * stage, so the pipeline re-enters VDEV_IO_START on resumption.)
 */
void
zio_vdev_io_reissue(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_stage >>= 1;
}
2828 2830
/*
 * Rewind the pipeline so that the VDEV_IO_DONE stage executes again.
 */
void
zio_vdev_io_redone(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);

	zio->io_stage >>= 1;
}
2836 2838
/*
 * Skip the vdev I/O stages entirely: mark the zio as bypassed and
 * advance the pipeline so the next stage executed is VDEV_IO_ASSESS.
 */
void
zio_vdev_io_bypass(zio_t *zio)
{
	ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
	ASSERT(zio->io_error == 0);

	zio->io_flags |= ZIO_FLAG_IO_BYPASS;
	zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
}
2846 2848
2847 2849 /*
2848 2850 * ==========================================================================
2849 2851 * Generate and verify checksums
2850 2852 * ==========================================================================
2851 2853 */
2852 2854 static int
2853 2855 zio_checksum_generate(zio_t *zio)
2854 2856 {
2855 2857 blkptr_t *bp = zio->io_bp;
2856 2858 enum zio_checksum checksum;
2857 2859
2858 2860 if (bp == NULL) {
2859 2861 /*
2860 2862 * This is zio_write_phys().
2861 2863 * We're either generating a label checksum, or none at all.
2862 2864 */
2863 2865 checksum = zio->io_prop.zp_checksum;
2864 2866
2865 2867 if (checksum == ZIO_CHECKSUM_OFF)
2866 2868 return (ZIO_PIPELINE_CONTINUE);
2867 2869
2868 2870 ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2869 2871 } else {
2870 2872 if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2871 2873 ASSERT(!IO_IS_ALLOCATING(zio));
2872 2874 checksum = ZIO_CHECKSUM_GANG_HEADER;
2873 2875 } else {
2874 2876 checksum = BP_GET_CHECKSUM(bp);
2875 2877 }
2876 2878 }
2877 2879
2878 2880 zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2879 2881
2880 2882 return (ZIO_PIPELINE_CONTINUE);
2881 2883 }
2882 2884
/*
 * Pipeline stage: verify the checksum of data just read.  For physical
 * label reads (bp == NULL) the expected checksum type comes from io_prop
 * and may be OFF; otherwise zio_checksum_error() determines and checks
 * it.  A checksum mismatch on a non-speculative I/O starts a deferred
 * checksum ereport.
 */
static int
zio_checksum_verify(zio_t *zio)
{
	zio_bad_cksum_t info;
	blkptr_t *bp = zio->io_bp;
	int error;

	ASSERT(zio->io_vd != NULL);

	if (bp == NULL) {
		/*
		 * This is zio_read_phys().
		 * We're either verifying a label checksum, or nothing at all.
		 */
		if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
			return (ZIO_PIPELINE_CONTINUE);

		ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
	}

	if ((error = zio_checksum_error(zio, &info)) != 0) {
		zio->io_error = error;
		/* Speculative (prefetch-style) I/Os do not raise ereports. */
		if (error == ECKSUM &&
		    !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
			zfs_ereport_start_checksum(zio->io_spa,
			    zio->io_vd, zio, zio->io_offset,
			    zio->io_size, NULL, &info);
		}
	}

	return (ZIO_PIPELINE_CONTINUE);
}
2915 2917
/*
 * Called by RAID-Z to ensure we don't compute the checksum twice.
 */
void
zio_checksum_verified(zio_t *zio)
{
	/* Remove the verify stage from this zio's remaining pipeline. */
	zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
}
2924 2926
/*
 * ==========================================================================
 * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
 * An error of 0 indicates success. ENXIO indicates whole-device failure,
 * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO
 * indicate errors that are specific to one I/O, and most likely permanent.
 * Any other error is presumed to be worse because we weren't expecting it.
 * ==========================================================================
 */
2934 2936 int
2935 2937 zio_worst_error(int e1, int e2)
2936 2938 {
2937 2939 static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2938 2940 int r1, r2;
2939 2941
2940 2942 for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2941 2943 if (e1 == zio_error_rank[r1])
2942 2944 break;
2943 2945
2944 2946 for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2945 2947 if (e2 == zio_error_rank[r2])
2946 2948 break;
2947 2949
2948 2950 return (r1 > r2 ? e1 : e2);
2949 2951 }
2950 2952
/*
 * ==========================================================================
 * I/O completion
 * ==========================================================================
 */
/*
 * Pipeline stage: mark this zio "ready" and notify waiting parents.
 * For allocating I/Os this runs the io_ready callback, then snapshots
 * the block pointer into io_bp_copy and wakes every parent currently
 * waiting on ZIO_WAIT_READY.
 */
static int
zio_ready(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	zio_t *pio, *pio_next;

	/* Gang and ddt children must be ready before we are. */
	if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
		return (ZIO_PIPELINE_STOP);

	if (zio->io_ready) {
		ASSERT(IO_IS_ALLOCATING(zio));
		ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) ||
		    (zio->io_flags & ZIO_FLAG_NOPWRITE));
		ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);

		zio->io_ready(zio);
	}

	if (bp != NULL && bp != &zio->io_bp_copy)
		zio->io_bp_copy = *bp;

	/* An error collapses the remaining pipeline to the interlock set. */
	if (zio->io_error)
		zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_READY] = 1;
	pio = zio_walk_parents(zio);
	mutex_exit(&zio->io_lock);

	/*
	 * As we notify zio's parents, new parents could be added.
	 * New parents go to the head of zio's io_parent_list, however,
	 * so we will (correctly) not notify them. The remainder of zio's
	 * io_parent_list, from 'pio_next' onward, cannot change because
	 * all parents must wait for us to be done before they can be done.
	 */
	for (; pio != NULL; pio = pio_next) {
		pio_next = zio_walk_parents(zio);
		zio_notify_parent(pio, zio, ZIO_WAIT_READY);
	}

	if (zio->io_flags & ZIO_FLAG_NODATA) {
		if (BP_IS_GANG(bp)) {
			zio->io_flags &= ~ZIO_FLAG_NODATA;
		} else {
			ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
		}
	}

	if (zio_injection_enabled &&
	    zio->io_spa->spa_syncing_txg == zio->io_txg)
		zio_handle_ignored_writes(zio);

	return (ZIO_PIPELINE_CONTINUE);
}
3013 3015
/*
 * Pipeline stage: final completion for a zio.  Waits for all children,
 * validates the resulting block pointer, inherits child errors, finishes
 * checksum reports and posts FMA ereports, decides whether the I/O must
 * be reexecuted or the pool suspended, and finally notifies parents and
 * either wakes the synchronous waiter or destroys the zio.
 */
static int
zio_done(zio_t *zio)
{
	spa_t *spa = zio->io_spa;
	zio_t *lio = zio->io_logical;
	blkptr_t *bp = zio->io_bp;
	vdev_t *vd = zio->io_vd;
	uint64_t psize = zio->io_size;	/* size before transforms are popped */
	zio_t *pio, *pio_next;

	/*
	 * If our children haven't all completed,
	 * wait for them and then repeat this pipeline stage.
	 */
	if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
	    zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
		return (ZIO_PIPELINE_STOP);

	for (int c = 0; c < ZIO_CHILD_TYPES; c++)
		for (int w = 0; w < ZIO_WAIT_TYPES; w++)
			ASSERT(zio->io_children[c][w] == 0);

	/* Sanity-check the block pointer produced by this I/O. */
	if (bp != NULL && !BP_IS_EMBEDDED(bp)) {
		ASSERT(bp->blk_pad[0] == 0);
		ASSERT(bp->blk_pad[1] == 0);
		ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
		    (bp == zio_unique_parent(zio)->io_bp));
		if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
		    zio->io_bp_override == NULL &&
		    !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
			ASSERT(!BP_SHOULD_BYTESWAP(bp));
			ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
			ASSERT(BP_COUNT_GANG(bp) == 0 ||
			    (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
		}
		if (zio->io_flags & ZIO_FLAG_NOPWRITE)
			VERIFY(BP_EQUAL(bp, &zio->io_bp_orig));
	}

	/*
	 * If there were child vdev/gang/ddt errors, they apply to us now.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
	zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
	zio_inherit_child_errors(zio, ZIO_CHILD_DDT);

	/*
	 * If the I/O on the transformed data was successful, generate any
	 * checksum reports now while we still have the transformed data.
	 */
	if (zio->io_error == 0) {
		while (zio->io_cksum_report != NULL) {
			zio_cksum_report_t *zcr = zio->io_cksum_report;
			uint64_t align = zcr->zcr_align;
			uint64_t asize = P2ROUNDUP(psize, align);
			char *abuf = zio->io_data;

			/* Pad out to the report's alignment if needed. */
			if (asize != psize) {
				abuf = zio_buf_alloc(asize);
				bcopy(zio->io_data, abuf, psize);
				bzero(abuf + psize, asize - psize);
			}

			zio->io_cksum_report = zcr->zcr_next;
			zcr->zcr_next = NULL;
			zcr->zcr_finish(zcr, abuf);
			zfs_ereport_free_checksum(zcr);

			if (asize != psize)
				zio_buf_free(abuf, asize);
		}
	}

	zio_pop_transforms(zio);	/* note: may set zio->io_error */

	vdev_stat_update(zio, psize);

	if (zio->io_error) {
		/*
		 * If this I/O is attached to a particular vdev,
		 * generate an error message describing the I/O failure
		 * at the block level.  We ignore these errors if the
		 * device is currently unavailable.
		 */
		if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
			zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);

		if ((zio->io_error == EIO || !(zio->io_flags &
		    (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
		    zio == lio) {
			/*
			 * For logical I/O requests, tell the SPA to log the
			 * error and generate a logical data ereport.
			 */
			spa_log_error(spa, zio);
			zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
			    0, 0);
		}
	}

	if (zio->io_error && zio == lio) {
		/*
		 * Determine whether zio should be reexecuted.  This will
		 * propagate all the way to the root via zio_notify_parent().
		 */
		ASSERT(vd == NULL && bp != NULL);
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		if (IO_IS_ALLOCATING(zio) &&
		    !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
			if (zio->io_error != ENOSPC)
				zio->io_reexecute |= ZIO_REEXECUTE_NOW;
			else
				zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
		}

		if ((zio->io_type == ZIO_TYPE_READ ||
		    zio->io_type == ZIO_TYPE_FREE) &&
		    !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
		    zio->io_error == ENXIO &&
		    spa_load_state(spa) == SPA_LOAD_NONE &&
		    spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
			zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;

		/*
		 * Here is a possibly good place to attempt to do
		 * either combinatorial reconstruction or error correction
		 * based on checksums.  It also might be a good place
		 * to send out preliminary ereports before we suspend
		 * processing.
		 */
	}

	/*
	 * If there were logical child errors, they apply to us now.
	 * We defer this until now to avoid conflating logical child
	 * errors with errors that happened to the zio itself when
	 * updating vdev stats and reporting FMA events above.
	 */
	zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);

	if ((zio->io_error || zio->io_reexecute) &&
	    IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
	    !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)))
		zio_dva_unallocate(zio, zio->io_gang_tree, bp);

	zio_gang_tree_free(&zio->io_gang_tree);

	/*
	 * Godfather I/Os should never suspend.
	 */
	if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
	    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
		zio->io_reexecute = 0;

	if (zio->io_reexecute) {
		/*
		 * This is a logical I/O that wants to reexecute.
		 *
		 * Reexecute is top-down.  When an i/o fails, if it's not
		 * the root, it simply notifies its parent and sticks around.
		 * The parent, seeing that it still has children in zio_done(),
		 * does the same.  This percolates all the way up to the root.
		 * The root i/o will reexecute or suspend the entire tree.
		 *
		 * This approach ensures that zio_reexecute() honors
		 * all the original i/o dependency relationships, e.g.
		 * parents not executing until children are ready.
		 */
		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);

		zio->io_gang_leader = NULL;

		mutex_enter(&zio->io_lock);
		zio->io_state[ZIO_WAIT_DONE] = 1;
		mutex_exit(&zio->io_lock);

		/*
		 * "The Godfather" I/O monitors its children but is
		 * not a true parent to them.  It will track them through
		 * the pipeline but severs its ties whenever they get into
		 * trouble (e.g. suspended).  This allows "The Godfather"
		 * I/O to return status without blocking.
		 */
		for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
			zio_link_t *zl = zio->io_walk_link;
			pio_next = zio_walk_parents(zio);

			if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
			    (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
				zio_remove_child(pio, zio, zl);
				zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
			}
		}

		if ((pio = zio_unique_parent(zio)) != NULL) {
			/*
			 * We're not a root i/o, so there's nothing to do
			 * but notify our parent.  Don't propagate errors
			 * upward since we haven't permanently failed yet.
			 */
			ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
			zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
			zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
		} else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
			/*
			 * We'd fail again if we reexecuted now, so suspend
			 * until conditions improve (e.g. device comes online).
			 */
			zio_suspend(spa, zio);
		} else {
			/*
			 * Reexecution is potentially a huge amount of work.
			 * Hand it off to the otherwise-unused claim taskq.
			 */
			ASSERT(zio->io_tqent.tqent_next == NULL);
			spa_taskq_dispatch_ent(spa, ZIO_TYPE_CLAIM,
			    ZIO_TASKQ_ISSUE, (task_func_t *)zio_reexecute, zio,
			    0, &zio->io_tqent);
		}
		return (ZIO_PIPELINE_STOP);
	}

	ASSERT(zio->io_child_count == 0);
	ASSERT(zio->io_reexecute == 0);
	ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));

	/*
	 * Report any checksum errors, since the I/O is complete.
	 */
	while (zio->io_cksum_report != NULL) {
		zio_cksum_report_t *zcr = zio->io_cksum_report;
		zio->io_cksum_report = zcr->zcr_next;
		zcr->zcr_next = NULL;
		zcr->zcr_finish(zcr, NULL);
		zfs_ereport_free_checksum(zcr);
	}

	/*
	 * It is the responsibility of the done callback to ensure that this
	 * particular zio is no longer discoverable for adoption, and as
	 * such, cannot acquire any new parents.
	 */
	if (zio->io_done)
		zio->io_done(zio);

	mutex_enter(&zio->io_lock);
	zio->io_state[ZIO_WAIT_DONE] = 1;
	mutex_exit(&zio->io_lock);

	for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
		zio_link_t *zl = zio->io_walk_link;
		pio_next = zio_walk_parents(zio);
		zio_remove_child(pio, zio, zl);
		zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
	}

	if (zio->io_waiter != NULL) {
		/* Synchronous caller (zio_wait) frees the zio itself. */
		mutex_enter(&zio->io_lock);
		zio->io_executor = NULL;
		cv_broadcast(&zio->io_cv);
		mutex_exit(&zio->io_lock);
	} else {
		zio_destroy(zio);
	}

	return (ZIO_PIPELINE_STOP);
}
3287 3289
/*
 * ==========================================================================
 * I/O pipeline definition
 * ==========================================================================
 */
/*
 * Table of pipeline stage functions, one entry per stage in execution
 * order.  NOTE(review): the table order appears to mirror the
 * ZIO_STAGE_* bit order (io_stage is advanced/rewound with shifts
 * elsewhere in this file) -- confirm against zio_impl.h before
 * inserting or reordering entries.
 */
static zio_pipe_stage_t *zio_pipeline[] = {
	NULL,
	zio_read_bp_init,
	zio_free_bp_init,
	zio_issue_async,
	zio_write_bp_init,
	zio_checksum_generate,
	zio_nop_write,
	zio_ddt_read_start,
	zio_ddt_read_done,
	zio_ddt_write,
	zio_ddt_free,
	zio_gang_assemble,
	zio_gang_issue,
	zio_dva_allocate,
	zio_dva_free,
	zio_dva_claim,
	zio_ready,
	zio_vdev_io_start,
	zio_vdev_io_done,
	zio_vdev_io_assess,
	zio_checksum_verify,
	zio_done
};
3317 3319
3318 3320 /* dnp is the dnode for zb1->zb_object */
3319 3321 boolean_t
3320 3322 zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_phys_t *zb1,
3321 3323 const zbookmark_phys_t *zb2)
3322 3324 {
3323 3325 uint64_t zb1nextL0, zb2thisobj;
3324 3326
3325 3327 ASSERT(zb1->zb_objset == zb2->zb_objset);
3326 3328 ASSERT(zb2->zb_level == 0);
3327 3329
3328 3330 /* The objset_phys_t isn't before anything. */
3329 3331 if (dnp == NULL)
3330 3332 return (B_FALSE);
3331 3333
3332 3334 zb1nextL0 = (zb1->zb_blkid + 1) <<
3333 3335 ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
3334 3336
3335 3337 zb2thisobj = zb2->zb_object ? zb2->zb_object :
3336 3338 zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
3337 3339
3338 3340 if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
3339 3341 uint64_t nextobj = zb1nextL0 *
3340 3342 (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
3341 3343 return (nextobj <= zb2thisobj);
3342 3344 }
3343 3345
3344 3346 if (zb1->zb_object < zb2thisobj)
3345 3347 return (B_TRUE);
3346 3348 if (zb1->zb_object > zb2thisobj)
3347 3349 return (B_FALSE);
3348 3350 if (zb2->zb_object == DMU_META_DNODE_OBJECT)
3349 3351 return (B_FALSE);
3350 3352 return (zb1nextL0 <= zb2->zb_blkid);
3351 3353 }
|
↓ open down ↓ |
2206 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX