2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
    
      
    
    
          --- old/usr/src/uts/common/fs/zfs/zio.c
          +++ new/usr/src/uts/common/fs/zfs/zio.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
  25   25   */
  26   26  
  27   27  #include <sys/zfs_context.h>
  28   28  #include <sys/fm/fs/zfs.h>
  29   29  #include <sys/spa.h>
  30   30  #include <sys/txg.h>
  31   31  #include <sys/spa_impl.h>
  32   32  #include <sys/vdev_impl.h>
  33   33  #include <sys/zio_impl.h>
  34   34  #include <sys/zio_compress.h>
  35   35  #include <sys/zio_checksum.h>
  36   36  #include <sys/dmu_objset.h>
  37   37  #include <sys/arc.h>
  38   38  #include <sys/ddt.h>
  39   39  
  40   40  /*
  41   41   * ==========================================================================
  42   42   * I/O priority table
  43   43   * ==========================================================================
  44   44   */
  45   45  uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = {
  46   46          0,      /* ZIO_PRIORITY_NOW             */
  47   47          0,      /* ZIO_PRIORITY_SYNC_READ       */
  48   48          0,      /* ZIO_PRIORITY_SYNC_WRITE      */
  49   49          0,      /* ZIO_PRIORITY_LOG_WRITE       */
  50   50          1,      /* ZIO_PRIORITY_CACHE_FILL      */
  51   51          1,      /* ZIO_PRIORITY_AGG             */
  52   52          4,      /* ZIO_PRIORITY_FREE            */
  53   53          4,      /* ZIO_PRIORITY_ASYNC_WRITE     */
  54   54          6,      /* ZIO_PRIORITY_ASYNC_READ      */
  55   55          10,     /* ZIO_PRIORITY_RESILVER        */
  56   56          20,     /* ZIO_PRIORITY_SCRUB           */
  57   57          2,      /* ZIO_PRIORITY_DDT_PREFETCH    */
  58   58  };
  59   59  
  60   60  /*
  61   61   * ==========================================================================
  62   62   * I/O type descriptions
  63   63   * ==========================================================================
  64   64   */
  65   65  char *zio_type_name[ZIO_TYPES] = {
  66   66          "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim",
  67   67          "zio_ioctl"
  68   68  };
  69   69  
  70   70  /*
  71   71   * ==========================================================================
  72   72   * I/O kmem caches
  73   73   * ==========================================================================
  74   74   */
  75   75  kmem_cache_t *zio_cache;
  76   76  kmem_cache_t *zio_link_cache;
  77   77  kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  78   78  kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
  79   79  
  80   80  #ifdef _KERNEL
  81   81  extern vmem_t *zio_alloc_arena;
  82   82  #endif
  83   83  extern int zfs_mg_alloc_failures;
  84   84  
  85   85  /*
  86   86   * An allocating zio is one that either currently has the DVA allocate
  87   87   * stage set or will have it later in its lifetime.
  88   88   */
  89   89  #define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE)
  90   90  
  91   91  boolean_t       zio_requeue_io_start_cut_in_line = B_TRUE;
  92   92  
  93   93  #ifdef ZFS_DEBUG
  94   94  int zio_buf_debug_limit = 16384;
  95   95  #else
  96   96  int zio_buf_debug_limit = 0;
  97   97  #endif
  98   98  
  99   99  void
 100  100  zio_init(void)
 101  101  {
 102  102          size_t c;
 103  103          vmem_t *data_alloc_arena = NULL;
 104  104  
 105  105  #ifdef _KERNEL
 106  106          data_alloc_arena = zio_alloc_arena;
 107  107  #endif
 108  108          zio_cache = kmem_cache_create("zio_cache",
 109  109              sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 110  110          zio_link_cache = kmem_cache_create("zio_link_cache",
 111  111              sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
 112  112  
 113  113          /*
 114  114           * For small buffers, we want a cache for each multiple of
 115  115           * SPA_MINBLOCKSIZE.  For medium-size buffers, we want a cache
 116  116           * for each quarter-power of 2.  For large buffers, we want
 117  117           * a cache for each multiple of PAGESIZE.
 118  118           */
 119  119          for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 120  120                  size_t size = (c + 1) << SPA_MINBLOCKSHIFT;
 121  121                  size_t p2 = size;
 122  122                  size_t align = 0;
 123  123                  size_t cflags = (size > zio_buf_debug_limit) ? KMC_NODEBUG : 0;
 124  124  
 125  125                  while (p2 & (p2 - 1))
 126  126                          p2 &= p2 - 1;
 127  127  
 128  128                  if (size <= 4 * SPA_MINBLOCKSIZE) {
 129  129                          align = SPA_MINBLOCKSIZE;
 130  130                  } else if (P2PHASE(size, PAGESIZE) == 0) {
 131  131                          align = PAGESIZE;
 132  132                  } else if (P2PHASE(size, p2 >> 2) == 0) {
 133  133                          align = p2 >> 2;
 134  134                  }
 135  135  
 136  136                  if (align != 0) {
 137  137                          char name[36];
 138  138                          (void) sprintf(name, "zio_buf_%lu", (ulong_t)size);
 139  139                          zio_buf_cache[c] = kmem_cache_create(name, size,
 140  140                              align, NULL, NULL, NULL, NULL, NULL, cflags);
 141  141  
 142  142                          /*
 143  143                           * Since zio_data bufs do not appear in crash dumps, we
 144  144                           * pass KMC_NOTOUCH so that no allocator metadata is
 145  145                           * stored with the buffers.
 146  146                           */
 147  147                          (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size);
 148  148                          zio_data_buf_cache[c] = kmem_cache_create(name, size,
 149  149                              align, NULL, NULL, NULL, NULL, data_alloc_arena,
 150  150                              cflags | KMC_NOTOUCH);
 151  151                  }
 152  152          }
 153  153  
 154  154          while (--c != 0) {
 155  155                  ASSERT(zio_buf_cache[c] != NULL);
 156  156                  if (zio_buf_cache[c - 1] == NULL)
 157  157                          zio_buf_cache[c - 1] = zio_buf_cache[c];
 158  158  
 159  159                  ASSERT(zio_data_buf_cache[c] != NULL);
 160  160                  if (zio_data_buf_cache[c - 1] == NULL)
 161  161                          zio_data_buf_cache[c - 1] = zio_data_buf_cache[c];
 162  162          }
 163  163  
 164  164          /*
 165  165           * The zio write taskqs have 1 thread per cpu, allow 1/2 of the taskqs
 166  166           * to fail 3 times per txg or 8 failures, whichever is greater.
 167  167           */
 168  168          zfs_mg_alloc_failures = MAX((3 * max_ncpus / 2), 8);
 169  169  
 170  170          zio_inject_init();
 171  171  }
 172  172  
 173  173  void
 174  174  zio_fini(void)
 175  175  {
 176  176          size_t c;
 177  177          kmem_cache_t *last_cache = NULL;
 178  178          kmem_cache_t *last_data_cache = NULL;
 179  179  
 180  180          for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) {
 181  181                  if (zio_buf_cache[c] != last_cache) {
 182  182                          last_cache = zio_buf_cache[c];
 183  183                          kmem_cache_destroy(zio_buf_cache[c]);
 184  184                  }
 185  185                  zio_buf_cache[c] = NULL;
 186  186  
 187  187                  if (zio_data_buf_cache[c] != last_data_cache) {
 188  188                          last_data_cache = zio_data_buf_cache[c];
 189  189                          kmem_cache_destroy(zio_data_buf_cache[c]);
 190  190                  }
 191  191                  zio_data_buf_cache[c] = NULL;
 192  192          }
 193  193  
 194  194          kmem_cache_destroy(zio_link_cache);
 195  195          kmem_cache_destroy(zio_cache);
 196  196  
 197  197          zio_inject_fini();
 198  198  }
 199  199  
 200  200  /*
 201  201   * ==========================================================================
 202  202   * Allocate and free I/O buffers
 203  203   * ==========================================================================
 204  204   */
 205  205  
 206  206  /*
 207  207   * Use zio_buf_alloc to allocate ZFS metadata.  This data will appear in a
 208  208   * crashdump if the kernel panics, so use it judiciously.  Obviously, it's
 209  209   * useful to inspect ZFS metadata, but if possible, we should avoid keeping
 210  210   * excess / transient data in-core during a crashdump.
 211  211   */
 212  212  void *
 213  213  zio_buf_alloc(size_t size)
 214  214  {
 215  215          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 216  216  
 217  217          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 218  218  
 219  219          return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE));
 220  220  }
 221  221  
 222  222  /*
 223  223   * Use zio_data_buf_alloc to allocate data.  The data will not appear in a
 224  224   * crashdump if the kernel panics.  This exists so that we will limit the amount
 225  225   * of ZFS data that shows up in a kernel crashdump.  (Thus reducing the amount
 226  226   * of kernel heap dumped to disk when the kernel panics)
 227  227   */
 228  228  void *
 229  229  zio_data_buf_alloc(size_t size)
 230  230  {
 231  231          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 232  232  
 233  233          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 234  234  
 235  235          return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE));
 236  236  }
 237  237  
 238  238  void
 239  239  zio_buf_free(void *buf, size_t size)
 240  240  {
 241  241          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 242  242  
 243  243          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 244  244  
 245  245          kmem_cache_free(zio_buf_cache[c], buf);
 246  246  }
 247  247  
 248  248  void
 249  249  zio_data_buf_free(void *buf, size_t size)
 250  250  {
 251  251          size_t c = (size - 1) >> SPA_MINBLOCKSHIFT;
 252  252  
 253  253          ASSERT(c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT);
 254  254  
 255  255          kmem_cache_free(zio_data_buf_cache[c], buf);
 256  256  }
 257  257  
 258  258  /*
 259  259   * ==========================================================================
 260  260   * Push and pop I/O transform buffers
 261  261   * ==========================================================================
 262  262   */
 263  263  static void
 264  264  zio_push_transform(zio_t *zio, void *data, uint64_t size, uint64_t bufsize,
 265  265          zio_transform_func_t *transform)
 266  266  {
 267  267          zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP);
 268  268  
 269  269          zt->zt_orig_data = zio->io_data;
 270  270          zt->zt_orig_size = zio->io_size;
 271  271          zt->zt_bufsize = bufsize;
 272  272          zt->zt_transform = transform;
 273  273  
 274  274          zt->zt_next = zio->io_transform_stack;
 275  275          zio->io_transform_stack = zt;
 276  276  
 277  277          zio->io_data = data;
 278  278          zio->io_size = size;
 279  279  }
 280  280  
 281  281  static void
 282  282  zio_pop_transforms(zio_t *zio)
 283  283  {
 284  284          zio_transform_t *zt;
 285  285  
 286  286          while ((zt = zio->io_transform_stack) != NULL) {
 287  287                  if (zt->zt_transform != NULL)
 288  288                          zt->zt_transform(zio,
 289  289                              zt->zt_orig_data, zt->zt_orig_size);
 290  290  
 291  291                  if (zt->zt_bufsize != 0)
 292  292                          zio_buf_free(zio->io_data, zt->zt_bufsize);
 293  293  
 294  294                  zio->io_data = zt->zt_orig_data;
 295  295                  zio->io_size = zt->zt_orig_size;
 296  296                  zio->io_transform_stack = zt->zt_next;
 297  297  
 298  298                  kmem_free(zt, sizeof (zio_transform_t));
 299  299          }
 300  300  }
 301  301  
 302  302  /*
 303  303   * ==========================================================================
 304  304   * I/O transform callbacks for subblocks and decompression
 305  305   * ==========================================================================
 306  306   */
 307  307  static void
 308  308  zio_subblock(zio_t *zio, void *data, uint64_t size)
 309  309  {
 310  310          ASSERT(zio->io_size > size);
 311  311  
 312  312          if (zio->io_type == ZIO_TYPE_READ)
 313  313                  bcopy(zio->io_data, data, size);
 314  314  }
 315  315  
 316  316  static void
 317  317  zio_decompress(zio_t *zio, void *data, uint64_t size)
 318  318  {
 319  319          if (zio->io_error == 0 &&
 320  320              zio_decompress_data(BP_GET_COMPRESS(zio->io_bp),
 321  321              zio->io_data, data, zio->io_size, size) != 0)
 322  322                  zio->io_error = EIO;
 323  323  }
 324  324  
 325  325  /*
 326  326   * ==========================================================================
 327  327   * I/O parent/child relationships and pipeline interlocks
 328  328   * ==========================================================================
 329  329   */
 330  330  /*
 331  331   * NOTE - Callers to zio_walk_parents() and zio_walk_children must
 332  332   *        continue calling these functions until they return NULL.
 333  333   *        Otherwise, the next caller will pick up the list walk in
 334  334   *        some indeterminate state.  (Otherwise every caller would
 335  335   *        have to pass in a cookie to keep the state represented by
 336  336   *        io_walk_link, which gets annoying.)
 337  337   */
 338  338  zio_t *
 339  339  zio_walk_parents(zio_t *cio)
 340  340  {
 341  341          zio_link_t *zl = cio->io_walk_link;
 342  342          list_t *pl = &cio->io_parent_list;
 343  343  
 344  344          zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl);
 345  345          cio->io_walk_link = zl;
 346  346  
 347  347          if (zl == NULL)
 348  348                  return (NULL);
 349  349  
 350  350          ASSERT(zl->zl_child == cio);
 351  351          return (zl->zl_parent);
 352  352  }
 353  353  
 354  354  zio_t *
 355  355  zio_walk_children(zio_t *pio)
 356  356  {
 357  357          zio_link_t *zl = pio->io_walk_link;
 358  358          list_t *cl = &pio->io_child_list;
 359  359  
 360  360          zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl);
 361  361          pio->io_walk_link = zl;
 362  362  
 363  363          if (zl == NULL)
 364  364                  return (NULL);
 365  365  
 366  366          ASSERT(zl->zl_parent == pio);
 367  367          return (zl->zl_child);
 368  368  }
 369  369  
 370  370  zio_t *
 371  371  zio_unique_parent(zio_t *cio)
 372  372  {
 373  373          zio_t *pio = zio_walk_parents(cio);
 374  374  
 375  375          VERIFY(zio_walk_parents(cio) == NULL);
 376  376          return (pio);
 377  377  }
 378  378  
 379  379  void
 380  380  zio_add_child(zio_t *pio, zio_t *cio)
 381  381  {
 382  382          zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP);
 383  383  
 384  384          /*
 385  385           * Logical I/Os can have logical, gang, or vdev children.
 386  386           * Gang I/Os can have gang or vdev children.
 387  387           * Vdev I/Os can only have vdev children.
 388  388           * The following ASSERT captures all of these constraints.
 389  389           */
 390  390          ASSERT(cio->io_child_type <= pio->io_child_type);
 391  391  
 392  392          zl->zl_parent = pio;
 393  393          zl->zl_child = cio;
 394  394  
 395  395          mutex_enter(&cio->io_lock);
 396  396          mutex_enter(&pio->io_lock);
 397  397  
 398  398          ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0);
 399  399  
 400  400          for (int w = 0; w < ZIO_WAIT_TYPES; w++)
 401  401                  pio->io_children[cio->io_child_type][w] += !cio->io_state[w];
 402  402  
 403  403          list_insert_head(&pio->io_child_list, zl);
 404  404          list_insert_head(&cio->io_parent_list, zl);
 405  405  
 406  406          pio->io_child_count++;
 407  407          cio->io_parent_count++;
 408  408  
 409  409          mutex_exit(&pio->io_lock);
 410  410          mutex_exit(&cio->io_lock);
 411  411  }
 412  412  
 413  413  static void
 414  414  zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl)
 415  415  {
 416  416          ASSERT(zl->zl_parent == pio);
 417  417          ASSERT(zl->zl_child == cio);
 418  418  
 419  419          mutex_enter(&cio->io_lock);
 420  420          mutex_enter(&pio->io_lock);
 421  421  
 422  422          list_remove(&pio->io_child_list, zl);
 423  423          list_remove(&cio->io_parent_list, zl);
 424  424  
 425  425          pio->io_child_count--;
 426  426          cio->io_parent_count--;
 427  427  
 428  428          mutex_exit(&pio->io_lock);
 429  429          mutex_exit(&cio->io_lock);
 430  430  
 431  431          kmem_cache_free(zio_link_cache, zl);
 432  432  }
 433  433  
 434  434  static boolean_t
 435  435  zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait)
 436  436  {
 437  437          uint64_t *countp = &zio->io_children[child][wait];
 438  438          boolean_t waiting = B_FALSE;
 439  439  
 440  440          mutex_enter(&zio->io_lock);
 441  441          ASSERT(zio->io_stall == NULL);
 442  442          if (*countp != 0) {
 443  443                  zio->io_stage >>= 1;
 444  444                  zio->io_stall = countp;
 445  445                  waiting = B_TRUE;
 446  446          }
 447  447          mutex_exit(&zio->io_lock);
 448  448  
 449  449          return (waiting);
 450  450  }
 451  451  
 452  452  static void
 453  453  zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait)
 454  454  {
 455  455          uint64_t *countp = &pio->io_children[zio->io_child_type][wait];
 456  456          int *errorp = &pio->io_child_error[zio->io_child_type];
 457  457  
 458  458          mutex_enter(&pio->io_lock);
 459  459          if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE))
 460  460                  *errorp = zio_worst_error(*errorp, zio->io_error);
 461  461          pio->io_reexecute |= zio->io_reexecute;
 462  462          ASSERT3U(*countp, >, 0);
 463  463          if (--*countp == 0 && pio->io_stall == countp) {
 464  464                  pio->io_stall = NULL;
 465  465                  mutex_exit(&pio->io_lock);
 466  466                  zio_execute(pio);
 467  467          } else {
 468  468                  mutex_exit(&pio->io_lock);
 469  469          }
 470  470  }
 471  471  
 472  472  static void
 473  473  zio_inherit_child_errors(zio_t *zio, enum zio_child c)
 474  474  {
 475  475          if (zio->io_child_error[c] != 0 && zio->io_error == 0)
 476  476                  zio->io_error = zio->io_child_error[c];
 477  477  }
 478  478  
 479  479  /*
 480  480   * ==========================================================================
 481  481   * Create the various types of I/O (read, write, free, etc)
 482  482   * ==========================================================================
 483  483   */
 484  484  static zio_t *
 485  485  zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 486  486      void *data, uint64_t size, zio_done_func_t *done, void *private,
 487  487      zio_type_t type, int priority, enum zio_flag flags,
 488  488      vdev_t *vd, uint64_t offset, const zbookmark_t *zb,
 489  489      enum zio_stage stage, enum zio_stage pipeline)
 490  490  {
 491  491          zio_t *zio;
 492  492  
 493  493          ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 494  494          ASSERT(P2PHASE(size, SPA_MINBLOCKSIZE) == 0);
 495  495          ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0);
 496  496  
 497  497          ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER));
 498  498          ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER));
 499  499          ASSERT(vd || stage == ZIO_STAGE_OPEN);
 500  500  
 501  501          zio = kmem_cache_alloc(zio_cache, KM_SLEEP);
 502  502          bzero(zio, sizeof (zio_t));
 503  503  
 504  504          mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL);
 505  505          cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL);
 506  506  
 507  507          list_create(&zio->io_parent_list, sizeof (zio_link_t),
 508  508              offsetof(zio_link_t, zl_parent_node));
 509  509          list_create(&zio->io_child_list, sizeof (zio_link_t),
 510  510              offsetof(zio_link_t, zl_child_node));
 511  511  
 512  512          if (vd != NULL)
 513  513                  zio->io_child_type = ZIO_CHILD_VDEV;
 514  514          else if (flags & ZIO_FLAG_GANG_CHILD)
 515  515                  zio->io_child_type = ZIO_CHILD_GANG;
 516  516          else if (flags & ZIO_FLAG_DDT_CHILD)
 517  517                  zio->io_child_type = ZIO_CHILD_DDT;
 518  518          else
 519  519                  zio->io_child_type = ZIO_CHILD_LOGICAL;
 520  520  
 521  521          if (bp != NULL) {
 522  522                  zio->io_bp = (blkptr_t *)bp;
 523  523                  zio->io_bp_copy = *bp;
 524  524                  zio->io_bp_orig = *bp;
 525  525                  if (type != ZIO_TYPE_WRITE ||
 526  526                      zio->io_child_type == ZIO_CHILD_DDT)
 527  527                          zio->io_bp = &zio->io_bp_copy;  /* so caller can free */
 528  528                  if (zio->io_child_type == ZIO_CHILD_LOGICAL)
 529  529                          zio->io_logical = zio;
 530  530                  if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp))
 531  531                          pipeline |= ZIO_GANG_STAGES;
 532  532          }
 533  533  
 534  534          zio->io_spa = spa;
 535  535          zio->io_txg = txg;
 536  536          zio->io_done = done;
 537  537          zio->io_private = private;
 538  538          zio->io_type = type;
 539  539          zio->io_priority = priority;
 540  540          zio->io_vd = vd;
 541  541          zio->io_offset = offset;
 542  542          zio->io_orig_data = zio->io_data = data;
 543  543          zio->io_orig_size = zio->io_size = size;
 544  544          zio->io_orig_flags = zio->io_flags = flags;
 545  545          zio->io_orig_stage = zio->io_stage = stage;
 546  546          zio->io_orig_pipeline = zio->io_pipeline = pipeline;
 547  547  
 548  548          zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY);
 549  549          zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE);
 550  550  
 551  551          if (zb != NULL)
 552  552                  zio->io_bookmark = *zb;
 553  553  
 554  554          if (pio != NULL) {
 555  555                  if (zio->io_logical == NULL)
 556  556                          zio->io_logical = pio->io_logical;
 557  557                  if (zio->io_child_type == ZIO_CHILD_GANG)
 558  558                          zio->io_gang_leader = pio->io_gang_leader;
 559  559                  zio_add_child(pio, zio);
 560  560          }
 561  561  
 562  562          return (zio);
 563  563  }
 564  564  
 565  565  static void
 566  566  zio_destroy(zio_t *zio)
 567  567  {
 568  568          list_destroy(&zio->io_parent_list);
 569  569          list_destroy(&zio->io_child_list);
 570  570          mutex_destroy(&zio->io_lock);
 571  571          cv_destroy(&zio->io_cv);
 572  572          kmem_cache_free(zio_cache, zio);
 573  573  }
 574  574  
 575  575  zio_t *
 576  576  zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done,
 577  577      void *private, enum zio_flag flags)
 578  578  {
 579  579          zio_t *zio;
 580  580  
 581  581          zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 582  582              ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL,
 583  583              ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE);
 584  584  
 585  585          return (zio);
 586  586  }
 587  587  
 588  588  zio_t *
 589  589  zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags)
 590  590  {
 591  591          return (zio_null(NULL, spa, NULL, done, private, flags));
 592  592  }
 593  593  
 594  594  zio_t *
 595  595  zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
 596  596      void *data, uint64_t size, zio_done_func_t *done, void *private,
 597  597      int priority, enum zio_flag flags, const zbookmark_t *zb)
 598  598  {
 599  599          zio_t *zio;
 600  600  
 601  601          zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp,
 602  602              data, size, done, private,
 603  603              ZIO_TYPE_READ, priority, flags, NULL, 0, zb,
 604  604              ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 605  605              ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE);
 606  606  
 607  607          return (zio);
 608  608  }
 609  609  
 610  610  zio_t *
 611  611  zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp,
  
 612  612      void *data, uint64_t size, const zio_prop_t *zp,
 613  613      zio_done_func_t *ready, zio_done_func_t *done, void *private,
 614  614      int priority, enum zio_flag flags, const zbookmark_t *zb)
 615  615  {
 616  616          zio_t *zio;
 617  617  
 618  618          ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF &&
 619  619              zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS &&
 620  620              zp->zp_compress >= ZIO_COMPRESS_OFF &&
 621  621              zp->zp_compress < ZIO_COMPRESS_FUNCTIONS &&
 622      -            zp->zp_type < DMU_OT_NUMTYPES &&
      622 +            DMU_OT_IS_VALID(zp->zp_type) &&
 623  623              zp->zp_level < 32 &&
 624  624              zp->zp_copies > 0 &&
 625  625              zp->zp_copies <= spa_max_replication(spa) &&
 626  626              zp->zp_dedup <= 1 &&
 627  627              zp->zp_dedup_verify <= 1);
 628  628  
 629  629          zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 630  630              ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 631  631              ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ?
 632  632              ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE);
 633  633  
 634  634          zio->io_ready = ready;
 635  635          zio->io_prop = *zp;
 636  636  
 637  637          return (zio);
 638  638  }
 639  639  
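Review note: the change to zio_write() above replaces the open-coded bound check
`zp->zp_type < DMU_OT_NUMTYPES` with `DMU_OT_IS_VALID()`.  Under the feature-flags
work, an object type can be encoded directly from its byteswap/metadata properties
rather than drawn from the fixed dmu_ot[] table, so a simple "< DMU_OT_NUMTYPES"
comparison no longer covers every valid type.  The following is a sketch only; the
macro names, bit layout, and DMU_BSWAP_NUMFUNCS are illustrative assumptions, not
the actual dmu.h definitions.

    /*
     * Sketch (assumed encoding): new-style types carry a flag bit plus a
     * byteswap function index; legacy types remain plain table indices.
     */
    #define DMU_OT_NEWTYPE          0x80
    #define DMU_OT_METADATA         0x40
    #define DMU_OT_BYTESWAP_MASK    0x3f

    #define DMU_OT_IS_VALID(ot)     (((ot) & DMU_OT_NEWTYPE) ? \
            ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
            (ot) < DMU_OT_NUMTYPES)
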
 640  640  zio_t *
 641  641  zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data,
 642  642      uint64_t size, zio_done_func_t *done, void *private, int priority,
 643  643      enum zio_flag flags, zbookmark_t *zb)
 644  644  {
 645  645          zio_t *zio;
 646  646  
 647  647          zio = zio_create(pio, spa, txg, bp, data, size, done, private,
 648  648              ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb,
 649  649              ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE);
 650  650  
 651  651          return (zio);
 652  652  }
 653  653  
 654  654  void
 655  655  zio_write_override(zio_t *zio, blkptr_t *bp, int copies)
 656  656  {
 657  657          ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 658  658          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 659  659          ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
 660  660          ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa));
 661  661  
 662  662          zio->io_prop.zp_copies = copies;
 663  663          zio->io_bp_override = bp;
 664  664  }
 665  665  
 666  666  void
 667  667  zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp)
 668  668  {
 669  669          bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp);
 670  670  }
 671  671  
 672  672  zio_t *
 673  673  zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 674  674      enum zio_flag flags)
 675  675  {
 676  676          zio_t *zio;
 677  677  
 678  678          dprintf_bp(bp, "freeing in txg %llu, pass %u",
 679  679              (longlong_t)txg, spa->spa_sync_pass);
 680  680  
 681  681          ASSERT(!BP_IS_HOLE(bp));
 682  682          ASSERT(spa_syncing_txg(spa) == txg);
 683  683          ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
 684  684  
 685  685          zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 686  686              NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
 687  687              NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE);
 688  688  
 689  689          return (zio);
 690  690  }
 691  691  
 692  692  zio_t *
 693  693  zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
 694  694      zio_done_func_t *done, void *private, enum zio_flag flags)
 695  695  {
 696  696          zio_t *zio;
 697  697  
 698  698          /*
 699  699           * A claim is an allocation of a specific block.  Claims are needed
 700  700           * to support immediate writes in the intent log.  The issue is that
 701  701           * immediate writes contain committed data, but in a txg that was
 702  702           * *not* committed.  Upon opening the pool after an unclean shutdown,
 703  703           * the intent log claims all blocks that contain immediate write data
 704  704           * so that the SPA knows they're in use.
 705  705           *
 706  706           * All claims *must* be resolved in the first txg -- before the SPA
 707  707           * starts allocating blocks -- so that nothing is allocated twice.
 708  708           * If txg == 0 we just verify that the block is claimable.
 709  709           */
 710  710          ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa));
 711  711          ASSERT(txg == spa_first_txg(spa) || txg == 0);
 712  712          ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa));       /* zdb(1M) */
 713  713  
 714  714          zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
 715  715              done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags,
 716  716              NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE);
 717  717  
 718  718          return (zio);
 719  719  }
 720  720  
 721  721  zio_t *
 722  722  zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd,
 723  723      zio_done_func_t *done, void *private, int priority, enum zio_flag flags)
 724  724  {
 725  725          zio_t *zio;
 726  726          int c;
 727  727  
 728  728          if (vd->vdev_children == 0) {
 729  729                  zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private,
 730  730                      ZIO_TYPE_IOCTL, priority, flags, vd, 0, NULL,
 731  731                      ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE);
 732  732  
 733  733                  zio->io_cmd = cmd;
 734  734          } else {
 735  735                  zio = zio_null(pio, spa, NULL, NULL, NULL, flags);
 736  736  
 737  737                  for (c = 0; c < vd->vdev_children; c++)
 738  738                          zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd,
 739  739                              done, private, priority, flags));
 740  740          }
 741  741  
 742  742          return (zio);
 743  743  }
 744  744  
 745  745  zio_t *
 746  746  zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 747  747      void *data, int checksum, zio_done_func_t *done, void *private,
 748  748      int priority, enum zio_flag flags, boolean_t labels)
 749  749  {
 750  750          zio_t *zio;
 751  751  
 752  752          ASSERT(vd->vdev_children == 0);
 753  753          ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 754  754              offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 755  755          ASSERT3U(offset + size, <=, vd->vdev_psize);
 756  756  
 757  757          zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 758  758              ZIO_TYPE_READ, priority, flags, vd, offset, NULL,
 759  759              ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE);
 760  760  
 761  761          zio->io_prop.zp_checksum = checksum;
 762  762  
 763  763          return (zio);
 764  764  }
 765  765  
 766  766  zio_t *
 767  767  zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size,
 768  768      void *data, int checksum, zio_done_func_t *done, void *private,
 769  769      int priority, enum zio_flag flags, boolean_t labels)
 770  770  {
 771  771          zio_t *zio;
 772  772  
 773  773          ASSERT(vd->vdev_children == 0);
 774  774          ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE ||
 775  775              offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE);
 776  776          ASSERT3U(offset + size, <=, vd->vdev_psize);
 777  777  
 778  778          zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, done, private,
 779  779              ZIO_TYPE_WRITE, priority, flags, vd, offset, NULL,
 780  780              ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE);
 781  781  
 782  782          zio->io_prop.zp_checksum = checksum;
 783  783  
 784  784          if (zio_checksum_table[checksum].ci_eck) {
 785  785                  /*
 786  786                   * zec checksums are necessarily destructive -- they modify
 787  787                   * the end of the write buffer to hold the verifier/checksum.
 788  788                   * Therefore, we must make a local copy in case the data is
 789  789                   * being written to multiple places in parallel.
 790  790                   */
 791  791                  void *wbuf = zio_buf_alloc(size);
 792  792                  bcopy(data, wbuf, size);
 793  793                  zio_push_transform(zio, wbuf, size, size, NULL);
 794  794          }
 795  795  
 796  796          return (zio);
 797  797  }
 798  798  
 799  799  /*
 800  800   * Create a child I/O to do some work for us.
 801  801   */
 802  802  zio_t *
 803  803  zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset,
 804  804          void *data, uint64_t size, int type, int priority, enum zio_flag flags,
 805  805          zio_done_func_t *done, void *private)
 806  806  {
 807  807          enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE;
 808  808          zio_t *zio;
 809  809  
 810  810          ASSERT(vd->vdev_parent ==
 811  811              (pio->io_vd ? pio->io_vd : pio->io_spa->spa_root_vdev));
 812  812  
 813  813          if (type == ZIO_TYPE_READ && bp != NULL) {
 814  814                  /*
 815  815                   * If we have the bp, then the child should perform the
 816  816                   * checksum and the parent need not.  This pushes error
 817  817                   * detection as close to the leaves as possible and
 818  818                   * eliminates redundant checksums in the interior nodes.
 819  819                   */
 820  820                  pipeline |= ZIO_STAGE_CHECKSUM_VERIFY;
 821  821                  pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
 822  822          }
 823  823  
 824  824          if (vd->vdev_children == 0)
 825  825                  offset += VDEV_LABEL_START_SIZE;
 826  826  
 827  827          flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE;
 828  828  
 829  829          /*
 830  830           * If we've decided to do a repair, the write is not speculative --
 831  831           * even if the original read was.
 832  832           */
 833  833          if (flags & ZIO_FLAG_IO_REPAIR)
 834  834                  flags &= ~ZIO_FLAG_SPECULATIVE;
 835  835  
 836  836          zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size,
 837  837              done, private, type, priority, flags, vd, offset, &pio->io_bookmark,
 838  838              ZIO_STAGE_VDEV_IO_START >> 1, pipeline);
 839  839  
 840  840          return (zio);
 841  841  }
 842  842  
 843  843  zio_t *
 844  844  zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size,
 845  845          int type, int priority, enum zio_flag flags,
 846  846          zio_done_func_t *done, void *private)
 847  847  {
 848  848          zio_t *zio;
 849  849  
 850  850          ASSERT(vd->vdev_ops->vdev_op_leaf);
 851  851  
 852  852          zio = zio_create(NULL, vd->vdev_spa, 0, NULL,
 853  853              data, size, done, private, type, priority,
 854  854              flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY,
 855  855              vd, offset, NULL,
 856  856              ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE);
 857  857  
 858  858          return (zio);
 859  859  }
 860  860  
 861  861  void
 862  862  zio_flush(zio_t *zio, vdev_t *vd)
 863  863  {
 864  864          zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE,
 865  865              NULL, NULL, ZIO_PRIORITY_NOW,
 866  866              ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY));
 867  867  }
 868  868  
 869  869  void
 870  870  zio_shrink(zio_t *zio, uint64_t size)
 871  871  {
 872  872          ASSERT(zio->io_executor == NULL);
 873  873          ASSERT(zio->io_orig_size == zio->io_size);
 874  874          ASSERT(size <= zio->io_size);
 875  875  
 876  876          /*
 877  877           * We don't shrink for raidz because of problems with the
 878  878           * reconstruction when reading back less than the block size.
 879  879           * Note, BP_IS_RAIDZ() assumes no compression.
 880  880           */
 881  881          ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF);
 882  882          if (!BP_IS_RAIDZ(zio->io_bp))
 883  883                  zio->io_orig_size = zio->io_size = size;
 884  884  }
 885  885  
 886  886  /*
 887  887   * ==========================================================================
 888  888   * Prepare to read and write logical blocks
 889  889   * ==========================================================================
 890  890   */
 891  891  
 892  892  static int
 893  893  zio_read_bp_init(zio_t *zio)
 894  894  {
 895  895          blkptr_t *bp = zio->io_bp;
  
 896  896  
 897  897          if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF &&
 898  898              zio->io_child_type == ZIO_CHILD_LOGICAL &&
 899  899              !(zio->io_flags & ZIO_FLAG_RAW)) {
 900  900                  uint64_t psize = BP_GET_PSIZE(bp);
 901  901                  void *cbuf = zio_buf_alloc(psize);
 902  902  
 903  903                  zio_push_transform(zio, cbuf, psize, psize, zio_decompress);
 904  904          }
 905  905  
 906      -        if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0)
      906 +        if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0)
 907  907                  zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 908  908  
 909  909          if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP)
 910  910                  zio->io_flags |= ZIO_FLAG_DONT_CACHE;
 911  911  
 912  912          if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL)
 913  913                  zio->io_pipeline = ZIO_DDT_READ_PIPELINE;
 914  914  
 915  915          return (ZIO_PIPELINE_CONTINUE);
 916  916  }
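Review note: the zio_read_bp_init() change above drops the direct dmu_ot[] lookup
in favor of DMU_OT_IS_METADATA(), since a feature-flags object type may fall
outside the dmu_ot[] table.  A rough sketch under the same assumed encoding as
the note after zio_write(); the definition below is illustrative, not the actual
dmu.h macro.

    /*
     * Sketch (assumed): legacy types still consult the dmu_ot[] table for
     * their metadata property; new-style types carry a metadata bit directly.
     */
    #define DMU_OT_IS_METADATA(ot)  (((ot) & DMU_OT_NEWTYPE) ? \
            ((ot) & DMU_OT_METADATA) : \
            dmu_ot[(ot)].ot_metadata)
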
 917  917  
 918  918  static int
 919  919  zio_write_bp_init(zio_t *zio)
 920  920  {
 921  921          spa_t *spa = zio->io_spa;
 922  922          zio_prop_t *zp = &zio->io_prop;
 923  923          enum zio_compress compress = zp->zp_compress;
 924  924          blkptr_t *bp = zio->io_bp;
 925  925          uint64_t lsize = zio->io_size;
 926  926          uint64_t psize = lsize;
 927  927          int pass = 1;
 928  928  
 929  929          /*
 930  930           * If our children haven't all reached the ready stage,
 931  931           * wait for them and then repeat this pipeline stage.
 932  932           */
 933  933          if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
 934  934              zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_READY))
 935  935                  return (ZIO_PIPELINE_STOP);
 936  936  
 937  937          if (!IO_IS_ALLOCATING(zio))
 938  938                  return (ZIO_PIPELINE_CONTINUE);
 939  939  
 940  940          ASSERT(zio->io_child_type != ZIO_CHILD_DDT);
 941  941  
 942  942          if (zio->io_bp_override) {
 943  943                  ASSERT(bp->blk_birth != zio->io_txg);
 944  944                  ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0);
 945  945  
 946  946                  *bp = *zio->io_bp_override;
 947  947                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
 948  948  
 949  949                  if (BP_IS_HOLE(bp) || !zp->zp_dedup)
 950  950                          return (ZIO_PIPELINE_CONTINUE);
 951  951  
 952  952                  ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup ||
 953  953                      zp->zp_dedup_verify);
 954  954  
 955  955                  if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) {
 956  956                          BP_SET_DEDUP(bp, 1);
 957  957                          zio->io_pipeline |= ZIO_STAGE_DDT_WRITE;
 958  958                          return (ZIO_PIPELINE_CONTINUE);
 959  959                  }
 960  960                  zio->io_bp_override = NULL;
 961  961                  BP_ZERO(bp);
 962  962          }
 963  963  
 964  964          if (bp->blk_birth == zio->io_txg) {
 965  965                  /*
 966  966                   * We're rewriting an existing block, which means we're
 967  967                   * working on behalf of spa_sync().  For spa_sync() to
 968  968                   * converge, it must eventually be the case that we don't
 969  969                   * have to allocate new blocks.  But compression changes
 970  970                   * the blocksize, which forces a reallocate, and makes
 971  971                   * convergence take longer.  Therefore, after the first
 972  972                   * few passes, stop compressing to ensure convergence.
 973  973                   */
 974  974                  pass = spa_sync_pass(spa);
 975  975  
 976  976                  ASSERT(zio->io_txg == spa_syncing_txg(spa));
 977  977                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
 978  978                  ASSERT(!BP_GET_DEDUP(bp));
 979  979  
 980  980                  if (pass > SYNC_PASS_DONT_COMPRESS)
 981  981                          compress = ZIO_COMPRESS_OFF;
 982  982  
 983  983                  /* Make sure someone doesn't change their mind on overwrites */
 984  984                  ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp),
 985  985                      spa_max_replication(spa)) == BP_GET_NDVAS(bp));
 986  986          }
 987  987  
 988  988          if (compress != ZIO_COMPRESS_OFF) {
 989  989                  void *cbuf = zio_buf_alloc(lsize);
 990  990                  psize = zio_compress_data(compress, zio->io_data, cbuf, lsize);
 991  991                  if (psize == 0 || psize == lsize) {
 992  992                          compress = ZIO_COMPRESS_OFF;
 993  993                          zio_buf_free(cbuf, lsize);
 994  994                  } else {
 995  995                          ASSERT(psize < lsize);
 996  996                          zio_push_transform(zio, cbuf, psize, lsize, NULL);
 997  997                  }
 998  998          }
 999  999  
1000 1000          /*
1001 1001           * The final pass of spa_sync() must be all rewrites, but the first
1002 1002           * few passes offer a trade-off: allocating blocks defers convergence,
1003 1003           * but newly allocated blocks are sequential, so they can be written
1004 1004           * to disk faster.  Therefore, we allow the first few passes of
1005 1005           * spa_sync() to allocate new blocks, but force rewrites after that.
1006 1006           * There should only be a handful of blocks after pass 1 in any case.
1007 1007           */
1008 1008          if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
1009 1009              pass > SYNC_PASS_REWRITE) {
1010 1010                  ASSERT(psize != 0);
1011 1011                  enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
1012 1012                  zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;
1013 1013                  zio->io_flags |= ZIO_FLAG_IO_REWRITE;
1014 1014          } else {
1015 1015                  BP_ZERO(bp);
1016 1016                  zio->io_pipeline = ZIO_WRITE_PIPELINE;
1017 1017          }
1018 1018  
1019 1019          if (psize == 0) {
1020 1020                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1021 1021          } else {
1022 1022                  ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER);
1023 1023                  BP_SET_LSIZE(bp, lsize);
1024 1024                  BP_SET_PSIZE(bp, psize);
1025 1025                  BP_SET_COMPRESS(bp, compress);
1026 1026                  BP_SET_CHECKSUM(bp, zp->zp_checksum);
1027 1027                  BP_SET_TYPE(bp, zp->zp_type);
1028 1028                  BP_SET_LEVEL(bp, zp->zp_level);
1029 1029                  BP_SET_DEDUP(bp, zp->zp_dedup);
1030 1030                  BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER);
1031 1031                  if (zp->zp_dedup) {
1032 1032                          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1033 1033                          ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE));
1034 1034                          zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE;
1035 1035                  }
1036 1036          }
1037 1037  
1038 1038          return (ZIO_PIPELINE_CONTINUE);
1039 1039  }
1040 1040  
1041 1041  static int
1042 1042  zio_free_bp_init(zio_t *zio)
1043 1043  {
1044 1044          blkptr_t *bp = zio->io_bp;
1045 1045  
1046 1046          if (zio->io_child_type == ZIO_CHILD_LOGICAL) {
1047 1047                  if (BP_GET_DEDUP(bp))
1048 1048                          zio->io_pipeline = ZIO_DDT_FREE_PIPELINE;
1049 1049          }
1050 1050  
1051 1051          return (ZIO_PIPELINE_CONTINUE);
1052 1052  }
1053 1053  
1054 1054  /*
1055 1055   * ==========================================================================
1056 1056   * Execute the I/O pipeline
1057 1057   * ==========================================================================
1058 1058   */
1059 1059  
1060 1060  static void
1061 1061  zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline)
1062 1062  {
1063 1063          spa_t *spa = zio->io_spa;
1064 1064          zio_type_t t = zio->io_type;
1065 1065          int flags = (cutinline ? TQ_FRONT : 0);
1066 1066  
1067 1067          /*
1068 1068           * If we're a config writer or a probe, the normal issue and
1069 1069           * interrupt threads may all be blocked waiting for the config lock.
1070 1070           * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL.
1071 1071           */
1072 1072          if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE))
1073 1073                  t = ZIO_TYPE_NULL;
1074 1074  
1075 1075          /*
1076 1076           * A similar issue exists for the L2ARC write thread until L2ARC 2.0.
1077 1077           */
1078 1078          if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux)
1079 1079                  t = ZIO_TYPE_NULL;
1080 1080  
1081 1081          /*
1082 1082           * If this is a high priority I/O, then use the high priority taskq.
1083 1083           */
1084 1084          if (zio->io_priority == ZIO_PRIORITY_NOW &&
1085 1085              spa->spa_zio_taskq[t][q + 1] != NULL)
1086 1086                  q++;
1087 1087  
1088 1088          ASSERT3U(q, <, ZIO_TASKQ_TYPES);
1089 1089  
1090 1090          /*
1091 1091           * NB: We are assuming that the zio can only be dispatched
1092 1092           * to a single taskq at a time.  It would be a grievous error
1093 1093           * to dispatch the zio to another taskq at the same time.
1094 1094           */
1095 1095          ASSERT(zio->io_tqent.tqent_next == NULL);
1096 1096          taskq_dispatch_ent(spa->spa_zio_taskq[t][q],
1097 1097              (task_func_t *)zio_execute, zio, flags, &zio->io_tqent);
1098 1098  }
1099 1099  
1100 1100  static boolean_t
1101 1101  zio_taskq_member(zio_t *zio, enum zio_taskq_type q)
1102 1102  {
1103 1103          kthread_t *executor = zio->io_executor;
1104 1104          spa_t *spa = zio->io_spa;
1105 1105  
1106 1106          for (zio_type_t t = 0; t < ZIO_TYPES; t++)
1107 1107                  if (taskq_member(spa->spa_zio_taskq[t][q], executor))
1108 1108                          return (B_TRUE);
1109 1109  
1110 1110          return (B_FALSE);
1111 1111  }
1112 1112  
1113 1113  static int
1114 1114  zio_issue_async(zio_t *zio)
1115 1115  {
1116 1116          zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1117 1117  
1118 1118          return (ZIO_PIPELINE_STOP);
1119 1119  }
1120 1120  
1121 1121  void
1122 1122  zio_interrupt(zio_t *zio)
1123 1123  {
1124 1124          zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE);
1125 1125  }
1126 1126  
1127 1127  /*
1128 1128   * Execute the I/O pipeline until one of the following occurs:
1129 1129   * (1) the I/O completes; (2) the pipeline stalls waiting for
1130 1130   * dependent child I/Os; (3) the I/O issues, so we're waiting
1131 1131   * for an I/O completion interrupt; (4) the I/O is delegated by
1132 1132   * vdev-level caching or aggregation; (5) the I/O is deferred
1133 1133   * due to vdev-level queueing; (6) the I/O is handed off to
1134 1134   * another thread.  In all cases, the pipeline stops whenever
1135 1135   * there's no CPU work; it never burns a thread in cv_wait().
1136 1136   *
1137 1137   * There's no locking on io_stage because there's no legitimate way
1138 1138   * for multiple threads to be attempting to process the same I/O.
1139 1139   */
1140 1140  static zio_pipe_stage_t *zio_pipeline[];
1141 1141  
1142 1142  void
1143 1143  zio_execute(zio_t *zio)
1144 1144  {
1145 1145          zio->io_executor = curthread;
1146 1146  
1147 1147          while (zio->io_stage < ZIO_STAGE_DONE) {
1148 1148                  enum zio_stage pipeline = zio->io_pipeline;
1149 1149                  enum zio_stage stage = zio->io_stage;
1150 1150                  int rv;
1151 1151  
1152 1152                  ASSERT(!MUTEX_HELD(&zio->io_lock));
1153 1153                  ASSERT(ISP2(stage));
1154 1154                  ASSERT(zio->io_stall == NULL);
1155 1155  
1156 1156                  do {
1157 1157                          stage <<= 1;
1158 1158                  } while ((stage & pipeline) == 0);
1159 1159  
1160 1160                  ASSERT(stage <= ZIO_STAGE_DONE);
1161 1161  
1162 1162                  /*
1163 1163                   * If we are in interrupt context and this pipeline stage
1164 1164                   * will grab a config lock that is held across I/O,
1165 1165                   * or may wait for an I/O that needs an interrupt thread
1166 1166                   * to complete, issue async to avoid deadlock.
1167 1167                   *
1168 1168                   * For VDEV_IO_START, we cut in line so that the io will
1169 1169                   * be sent to disk promptly.
1170 1170                   */
1171 1171                  if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL &&
1172 1172                      zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) {
1173 1173                          boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ?
1174 1174                              zio_requeue_io_start_cut_in_line : B_FALSE;
1175 1175                          zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut);
1176 1176                          return;
1177 1177                  }
1178 1178  
1179 1179                  zio->io_stage = stage;
1180 1180                  rv = zio_pipeline[highbit(stage) - 1](zio);
1181 1181  
1182 1182                  if (rv == ZIO_PIPELINE_STOP)
1183 1183                          return;
1184 1184  
1185 1185                  ASSERT(rv == ZIO_PIPELINE_CONTINUE);
1186 1186          }
1187 1187  }
1188 1188  
1189 1189  /*
1190 1190   * ==========================================================================
1191 1191   * Initiate I/O, either sync or async
1192 1192   * ==========================================================================
1193 1193   */
1194 1194  int
1195 1195  zio_wait(zio_t *zio)
1196 1196  {
1197 1197          int error;
1198 1198  
1199 1199          ASSERT(zio->io_stage == ZIO_STAGE_OPEN);
1200 1200          ASSERT(zio->io_executor == NULL);
1201 1201  
1202 1202          zio->io_waiter = curthread;
1203 1203  
1204 1204          zio_execute(zio);
1205 1205  
1206 1206          mutex_enter(&zio->io_lock);
1207 1207          while (zio->io_executor != NULL)
1208 1208                  cv_wait(&zio->io_cv, &zio->io_lock);
1209 1209          mutex_exit(&zio->io_lock);
1210 1210  
1211 1211          error = zio->io_error;
1212 1212          zio_destroy(zio);
1213 1213  
1214 1214          return (error);
1215 1215  }
1216 1216  
1217 1217  void
1218 1218  zio_nowait(zio_t *zio)
1219 1219  {
1220 1220          ASSERT(zio->io_executor == NULL);
1221 1221  
1222 1222          if (zio->io_child_type == ZIO_CHILD_LOGICAL &&
1223 1223              zio_unique_parent(zio) == NULL) {
1224 1224                  /*
1225 1225                   * This is a logical async I/O with no parent to wait for it.
1226 1226                   * We add it to the spa_async_root_zio "Godfather" I/O which
1227 1227                   * will ensure they complete prior to unloading the pool.
1228 1228                   */
1229 1229                  spa_t *spa = zio->io_spa;
1230 1230  
1231 1231                  zio_add_child(spa->spa_async_zio_root, zio);
1232 1232          }
1233 1233  
1234 1234          zio_execute(zio);
1235 1235  }
1236 1236  
1237 1237  /*
1238 1238   * ==========================================================================
1239 1239   * Reexecute or suspend/resume failed I/O
1240 1240   * ==========================================================================
1241 1241   */
1242 1242  
1243 1243  static void
1244 1244  zio_reexecute(zio_t *pio)
1245 1245  {
1246 1246          zio_t *cio, *cio_next;
1247 1247  
1248 1248          ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL);
1249 1249          ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN);
1250 1250          ASSERT(pio->io_gang_leader == NULL);
1251 1251          ASSERT(pio->io_gang_tree == NULL);
1252 1252  
1253 1253          pio->io_flags = pio->io_orig_flags;
1254 1254          pio->io_stage = pio->io_orig_stage;
1255 1255          pio->io_pipeline = pio->io_orig_pipeline;
1256 1256          pio->io_reexecute = 0;
1257 1257          pio->io_error = 0;
1258 1258          for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1259 1259                  pio->io_state[w] = 0;
1260 1260          for (int c = 0; c < ZIO_CHILD_TYPES; c++)
1261 1261                  pio->io_child_error[c] = 0;
1262 1262  
1263 1263          if (IO_IS_ALLOCATING(pio))
1264 1264                  BP_ZERO(pio->io_bp);
1265 1265  
1266 1266          /*
1267 1267           * As we reexecute pio's children, new children could be created.
1268 1268           * New children go to the head of pio's io_child_list, however,
1269 1269           * so we will (correctly) not reexecute them.  The key is that
1270 1270           * the remainder of pio's io_child_list, from 'cio_next' onward,
1271 1271           * cannot be affected by any side effects of reexecuting 'cio'.
1272 1272           */
1273 1273          for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) {
1274 1274                  cio_next = zio_walk_children(pio);
1275 1275                  mutex_enter(&pio->io_lock);
1276 1276                  for (int w = 0; w < ZIO_WAIT_TYPES; w++)
1277 1277                          pio->io_children[cio->io_child_type][w]++;
1278 1278                  mutex_exit(&pio->io_lock);
1279 1279                  zio_reexecute(cio);
1280 1280          }
1281 1281  
1282 1282          /*
1283 1283           * Now that all children have been reexecuted, execute the parent.
1284 1284           * We don't reexecute "The Godfather" I/O here as it's the
1285 1285           * responsibility of the caller to wait on him.
1286 1286           */
1287 1287          if (!(pio->io_flags & ZIO_FLAG_GODFATHER))
1288 1288                  zio_execute(pio);
1289 1289  }
1290 1290  
1291 1291  void
1292 1292  zio_suspend(spa_t *spa, zio_t *zio)
1293 1293  {
1294 1294          if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
1295 1295                  fm_panic("Pool '%s' has encountered an uncorrectable I/O "
1296 1296                      "failure and the failure mode property for this pool "
1297 1297                      "is set to panic.", spa_name(spa));
1298 1298  
1299 1299          zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL, NULL, 0, 0);
1300 1300  
1301 1301          mutex_enter(&spa->spa_suspend_lock);
1302 1302  
1303 1303          if (spa->spa_suspend_zio_root == NULL)
1304 1304                  spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
1305 1305                      ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
1306 1306                      ZIO_FLAG_GODFATHER);
1307 1307  
1308 1308          spa->spa_suspended = B_TRUE;
1309 1309  
1310 1310          if (zio != NULL) {
1311 1311                  ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
1312 1312                  ASSERT(zio != spa->spa_suspend_zio_root);
1313 1313                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1314 1314                  ASSERT(zio_unique_parent(zio) == NULL);
1315 1315                  ASSERT(zio->io_stage == ZIO_STAGE_DONE);
1316 1316                  zio_add_child(spa->spa_suspend_zio_root, zio);
1317 1317          }
1318 1318  
1319 1319          mutex_exit(&spa->spa_suspend_lock);
1320 1320  }
1321 1321  
1322 1322  int
1323 1323  zio_resume(spa_t *spa)
1324 1324  {
1325 1325          zio_t *pio;
1326 1326  
1327 1327          /*
1328 1328           * Reexecute all previously suspended i/o.
1329 1329           */
1330 1330          mutex_enter(&spa->spa_suspend_lock);
1331 1331          spa->spa_suspended = B_FALSE;
1332 1332          cv_broadcast(&spa->spa_suspend_cv);
1333 1333          pio = spa->spa_suspend_zio_root;
1334 1334          spa->spa_suspend_zio_root = NULL;
1335 1335          mutex_exit(&spa->spa_suspend_lock);
1336 1336  
1337 1337          if (pio == NULL)
1338 1338                  return (0);
1339 1339  
1340 1340          zio_reexecute(pio);
1341 1341          return (zio_wait(pio));
1342 1342  }
1343 1343  
1344 1344  void
1345 1345  zio_resume_wait(spa_t *spa)
1346 1346  {
1347 1347          mutex_enter(&spa->spa_suspend_lock);
1348 1348          while (spa_suspended(spa))
1349 1349                  cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
1350 1350          mutex_exit(&spa->spa_suspend_lock);
1351 1351  }
1352 1352  
1353 1353  /*
1354 1354   * ==========================================================================
1355 1355   * Gang blocks.
1356 1356   *
1357 1357   * A gang block is a collection of small blocks that looks to the DMU
1358 1358   * like one large block.  When zio_dva_allocate() cannot find a block
1359 1359   * of the requested size, due to either severe fragmentation or the pool
1360 1360   * being nearly full, it calls zio_write_gang_block() to construct the
1361 1361   * block from smaller fragments.
1362 1362   *
1363 1363   * A gang block consists of a gang header (zio_gbh_phys_t) and up to
1364 1364   * three (SPA_GBH_NBLKPTRS) gang members.  The gang header is just like
1365 1365   * an indirect block: it's an array of block pointers.  It consumes
1366 1366   * only one sector and hence is allocatable regardless of fragmentation.
1367 1367   * The gang header's bps point to its gang members, which hold the data.
1368 1368   *
1369 1369   * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
1370 1370   * as the verifier to ensure uniqueness of the SHA256 checksum.
1371 1371   * Critically, the gang block bp's blk_cksum is the checksum of the data,
1372 1372   * not the gang header.  This ensures that data block signatures (needed for
1373 1373   * deduplication) are independent of how the block is physically stored.
1374 1374   *
1375 1375   * Gang blocks can be nested: a gang member may itself be a gang block.
1376 1376   * Thus every gang block is a tree in which root and all interior nodes are
1377 1377   * gang headers, and the leaves are normal blocks that contain user data.
1378 1378   * The root of the gang tree is called the gang leader.
1379 1379   *
1380 1380   * To perform any operation (read, rewrite, free, claim) on a gang block,
1381 1381   * zio_gang_assemble() first assembles the gang tree (minus data leaves)
1382 1382   * in the io_gang_tree field of the original logical i/o by recursively
1383 1383   * reading the gang leader and all gang headers below it.  This yields
1384 1384   * an in-core tree containing the contents of every gang header and the
1385 1385   * bps for every constituent of the gang block.
1386 1386   *
1387 1387   * With the gang tree now assembled, zio_gang_issue() just walks the gang tree
1388 1388   * and invokes a callback on each bp.  To free a gang block, zio_gang_issue()
1389 1389   * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp.
1390 1390   * zio_claim_gang() provides a similarly trivial wrapper for zio_claim().
1391 1391   * zio_read_gang() is a wrapper around zio_read() that omits reading gang
1392 1392   * headers, since we already have those in io_gang_tree.  zio_rewrite_gang()
1393 1393   * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite()
1394 1394   * of the gang header plus zio_checksum_compute() of the data to update the
1395 1395   * gang header's blk_cksum as described above.
1396 1396   *
1397 1397   * The two-phase assemble/issue model solves the problem of partial failure --
1398 1398   * what if you'd freed part of a gang block but then couldn't read the
1399 1399   * gang header for another part?  Assembling the entire gang tree first
1400 1400   * ensures that all the necessary gang header I/O has succeeded before
1401 1401   * starting the actual work of free, claim, or write.  Once the gang tree
1402 1402   * is assembled, free and claim are in-memory operations that cannot fail.
1403 1403   *
1404 1404   * In the event that a gang write fails, zio_dva_unallocate() walks the
1405 1405   * gang tree to immediately free (i.e. insert back into the space map)
1406 1406   * everything we've allocated.  This ensures that we don't get ENOSPC
1407 1407   * errors during repeated suspend/resume cycles due to a flaky device.
1408 1408   *
1409 1409   * Gang rewrites only happen during sync-to-convergence.  If we can't assemble
1410 1410   * the gang tree, we won't modify the block, so we can safely defer the free
1411 1411   * (knowing that the block is still intact).  If we *can* assemble the gang
1412 1412   * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free
1413 1413   * each constituent bp and we can allocate a new block on the next sync pass.
1414 1414   *
1415 1415   * In all cases, the gang tree allows complete recovery from partial failure.
1416 1416   * ==========================================================================
1417 1417   */
1418 1418  
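/*
 * Editor's note: an illustrative walk-through, not part of the original
 * source.  For a free of a nested gang block, the two-phase model described
 * above plays out roughly as follows (gang headers GH, data leaves D):
 *
 *      zio_gang_assemble()            reads GH(root), then GH(child),
 *                                     building io_gang_tree entirely in core
 *      zio_gang_issue()               walks that tree and calls
 *          zio_free_gang(GH(root))    -> zio_free() of the root header bp
 *          zio_free_gang(GH(child))   -> zio_free() of the interior header bp
 *          zio_free_gang(D1), ...     -> zio_free() of each data leaf bp
 *
 * If any header read in the assemble phase fails, no frees are issued at
 * all, which is the partial-failure guarantee described above.
 */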
1419 1419  static zio_t *
1420 1420  zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1421 1421  {
1422 1422          if (gn != NULL)
1423 1423                  return (pio);
1424 1424  
1425 1425          return (zio_read(pio, pio->io_spa, bp, data, BP_GET_PSIZE(bp),
1426 1426              NULL, NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1427 1427              &pio->io_bookmark));
1428 1428  }
1429 1429  
1430 1430  zio_t *
1431 1431  zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1432 1432  {
1433 1433          zio_t *zio;
1434 1434  
1435 1435          if (gn != NULL) {
1436 1436                  zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1437 1437                      gn->gn_gbh, SPA_GANGBLOCKSIZE, NULL, NULL, pio->io_priority,
1438 1438                      ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1439 1439                  /*
1440 1440                   * As we rewrite each gang header, the pipeline will compute
1441 1441                   * a new gang block header checksum for it; but no one will
1442 1442                   * compute a new data checksum, so we do that here.  The one
1443 1443                   * exception is the gang leader: the pipeline already computed
1444 1444                   * its data checksum because that stage precedes gang assembly.
1445 1445                   * (Presently, nothing actually uses interior data checksums;
1446 1446                   * this is just good hygiene.)
1447 1447                   */
1448 1448                  if (gn != pio->io_gang_leader->io_gang_tree) {
1449 1449                          zio_checksum_compute(zio, BP_GET_CHECKSUM(bp),
1450 1450                              data, BP_GET_PSIZE(bp));
1451 1451                  }
1452 1452                  /*
1453 1453                   * If we are here to damage data for testing purposes,
1454 1454                   * leave the GBH alone so that we can detect the damage.
1455 1455                   */
1456 1456                  if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE)
1457 1457                          zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
1458 1458          } else {
1459 1459                  zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp,
1460 1460                      data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority,
1461 1461                      ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1462 1462          }
1463 1463  
1464 1464          return (zio);
1465 1465  }
1466 1466  
1467 1467  /* ARGSUSED */
1468 1468  zio_t *
1469 1469  zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1470 1470  {
1471 1471          return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp,
1472 1472              ZIO_GANG_CHILD_FLAGS(pio)));
1473 1473  }
1474 1474  
1475 1475  /* ARGSUSED */
1476 1476  zio_t *
1477 1477  zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data)
1478 1478  {
1479 1479          return (zio_claim(pio, pio->io_spa, pio->io_txg, bp,
1480 1480              NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)));
1481 1481  }
1482 1482  
1483 1483  static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = {
1484 1484          NULL,
1485 1485          zio_read_gang,
1486 1486          zio_rewrite_gang,
1487 1487          zio_free_gang,
1488 1488          zio_claim_gang,
1489 1489          NULL
1490 1490  };
1491 1491  
1492 1492  static void zio_gang_tree_assemble_done(zio_t *zio);
1493 1493  
1494 1494  static zio_gang_node_t *
1495 1495  zio_gang_node_alloc(zio_gang_node_t **gnpp)
1496 1496  {
1497 1497          zio_gang_node_t *gn;
1498 1498  
1499 1499          ASSERT(*gnpp == NULL);
1500 1500  
1501 1501          gn = kmem_zalloc(sizeof (*gn), KM_SLEEP);
1502 1502          gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE);
1503 1503          *gnpp = gn;
1504 1504  
1505 1505          return (gn);
1506 1506  }
1507 1507  
1508 1508  static void
1509 1509  zio_gang_node_free(zio_gang_node_t **gnpp)
1510 1510  {
1511 1511          zio_gang_node_t *gn = *gnpp;
1512 1512  
1513 1513          for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1514 1514                  ASSERT(gn->gn_child[g] == NULL);
1515 1515  
1516 1516          zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE);
1517 1517          kmem_free(gn, sizeof (*gn));
1518 1518          *gnpp = NULL;
1519 1519  }
1520 1520  
1521 1521  static void
1522 1522  zio_gang_tree_free(zio_gang_node_t **gnpp)
1523 1523  {
1524 1524          zio_gang_node_t *gn = *gnpp;
1525 1525  
1526 1526          if (gn == NULL)
1527 1527                  return;
1528 1528  
1529 1529          for (int g = 0; g < SPA_GBH_NBLKPTRS; g++)
1530 1530                  zio_gang_tree_free(&gn->gn_child[g]);
1531 1531  
1532 1532          zio_gang_node_free(gnpp);
1533 1533  }
1534 1534  
1535 1535  static void
1536 1536  zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp)
1537 1537  {
1538 1538          zio_gang_node_t *gn = zio_gang_node_alloc(gnpp);
1539 1539  
1540 1540          ASSERT(gio->io_gang_leader == gio);
1541 1541          ASSERT(BP_IS_GANG(bp));
1542 1542  
1543 1543          zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh,
1544 1544              SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn,
1545 1545              gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark));
1546 1546  }
1547 1547  
1548 1548  static void
1549 1549  zio_gang_tree_assemble_done(zio_t *zio)
1550 1550  {
1551 1551          zio_t *gio = zio->io_gang_leader;
1552 1552          zio_gang_node_t *gn = zio->io_private;
1553 1553          blkptr_t *bp = zio->io_bp;
1554 1554  
1555 1555          ASSERT(gio == zio_unique_parent(zio));
1556 1556          ASSERT(zio->io_child_count == 0);
1557 1557  
1558 1558          if (zio->io_error)
1559 1559                  return;
1560 1560  
1561 1561          if (BP_SHOULD_BYTESWAP(bp))
1562 1562                  byteswap_uint64_array(zio->io_data, zio->io_size);
1563 1563  
1564 1564          ASSERT(zio->io_data == gn->gn_gbh);
1565 1565          ASSERT(zio->io_size == SPA_GANGBLOCKSIZE);
1566 1566          ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1567 1567  
1568 1568          for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1569 1569                  blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1570 1570                  if (!BP_IS_GANG(gbp))
1571 1571                          continue;
1572 1572                  zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]);
1573 1573          }
1574 1574  }
1575 1575  
1576 1576  static void
1577 1577  zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data)
1578 1578  {
1579 1579          zio_t *gio = pio->io_gang_leader;
1580 1580          zio_t *zio;
1581 1581  
1582 1582          ASSERT(BP_IS_GANG(bp) == !!gn);
1583 1583          ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp));
1584 1584          ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree);
1585 1585  
1586 1586          /*
1587 1587           * If you're a gang header, your data is in gn->gn_gbh.
1588 1588           * If you're a gang member, your data is in 'data' and gn == NULL.
1589 1589           */
1590 1590          zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data);
1591 1591  
1592 1592          if (gn != NULL) {
1593 1593                  ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC);
1594 1594  
1595 1595                  for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
1596 1596                          blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g];
1597 1597                          if (BP_IS_HOLE(gbp))
1598 1598                                  continue;
1599 1599                          zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data);
1600 1600                          data = (char *)data + BP_GET_PSIZE(gbp);
1601 1601                  }
1602 1602          }
1603 1603  
1604 1604          if (gn == gio->io_gang_tree)
1605 1605                  ASSERT3P((char *)gio->io_data + gio->io_size, ==, data);
1606 1606  
1607 1607          if (zio != pio)
1608 1608                  zio_nowait(zio);
1609 1609  }
1610 1610  
1611 1611  static int
1612 1612  zio_gang_assemble(zio_t *zio)
1613 1613  {
1614 1614          blkptr_t *bp = zio->io_bp;
1615 1615  
1616 1616          ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL);
1617 1617          ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1618 1618  
1619 1619          zio->io_gang_leader = zio;
1620 1620  
1621 1621          zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree);
1622 1622  
1623 1623          return (ZIO_PIPELINE_CONTINUE);
1624 1624  }
1625 1625  
1626 1626  static int
1627 1627  zio_gang_issue(zio_t *zio)
1628 1628  {
1629 1629          blkptr_t *bp = zio->io_bp;
1630 1630  
1631 1631          if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE))
1632 1632                  return (ZIO_PIPELINE_STOP);
1633 1633  
1634 1634          ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio);
1635 1635          ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
1636 1636  
1637 1637          if (zio->io_child_error[ZIO_CHILD_GANG] == 0)
1638 1638                  zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data);
1639 1639          else
1640 1640                  zio_gang_tree_free(&zio->io_gang_tree);
1641 1641  
1642 1642          zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1643 1643  
1644 1644          return (ZIO_PIPELINE_CONTINUE);
1645 1645  }
1646 1646  
1647 1647  static void
1648 1648  zio_write_gang_member_ready(zio_t *zio)
1649 1649  {
1650 1650          zio_t *pio = zio_unique_parent(zio);
1651 1651          zio_t *gio = zio->io_gang_leader;
1652 1652          dva_t *cdva = zio->io_bp->blk_dva;
1653 1653          dva_t *pdva = pio->io_bp->blk_dva;
1654 1654          uint64_t asize;
1655 1655  
1656 1656          if (BP_IS_HOLE(zio->io_bp))
1657 1657                  return;
1658 1658  
1659 1659          ASSERT(BP_IS_HOLE(&zio->io_bp_orig));
1660 1660  
1661 1661          ASSERT(zio->io_child_type == ZIO_CHILD_GANG);
1662 1662          ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies);
1663 1663          ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp));
1664 1664          ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp));
1665 1665          ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp));
1666 1666  
1667 1667          mutex_enter(&pio->io_lock);
1668 1668          for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) {
1669 1669                  ASSERT(DVA_GET_GANG(&pdva[d]));
1670 1670                  asize = DVA_GET_ASIZE(&pdva[d]);
1671 1671                  asize += DVA_GET_ASIZE(&cdva[d]);
1672 1672                  DVA_SET_ASIZE(&pdva[d], asize);
1673 1673          }
1674 1674          mutex_exit(&pio->io_lock);
1675 1675  }
1676 1676  
1677 1677  static int
1678 1678  zio_write_gang_block(zio_t *pio)
1679 1679  {
1680 1680          spa_t *spa = pio->io_spa;
1681 1681          blkptr_t *bp = pio->io_bp;
1682 1682          zio_t *gio = pio->io_gang_leader;
1683 1683          zio_t *zio;
1684 1684          zio_gang_node_t *gn, **gnpp;
1685 1685          zio_gbh_phys_t *gbh;
1686 1686          uint64_t txg = pio->io_txg;
1687 1687          uint64_t resid = pio->io_size;
1688 1688          uint64_t lsize;
1689 1689          int copies = gio->io_prop.zp_copies;
1690 1690          int gbh_copies = MIN(copies + 1, spa_max_replication(spa));
1691 1691          zio_prop_t zp;
1692 1692          int error;
1693 1693  
1694 1694          error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE,
1695 1695              bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp,
1696 1696              METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER);
1697 1697          if (error) {
1698 1698                  pio->io_error = error;
1699 1699                  return (ZIO_PIPELINE_CONTINUE);
1700 1700          }
1701 1701  
1702 1702          if (pio == gio) {
1703 1703                  gnpp = &gio->io_gang_tree;
1704 1704          } else {
1705 1705                  gnpp = pio->io_private;
1706 1706                  ASSERT(pio->io_ready == zio_write_gang_member_ready);
1707 1707          }
1708 1708  
1709 1709          gn = zio_gang_node_alloc(gnpp);
1710 1710          gbh = gn->gn_gbh;
1711 1711          bzero(gbh, SPA_GANGBLOCKSIZE);
1712 1712  
1713 1713          /*
1714 1714           * Create the gang header.
1715 1715           */
1716 1716          zio = zio_rewrite(pio, spa, txg, bp, gbh, SPA_GANGBLOCKSIZE, NULL, NULL,
1717 1717              pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark);
1718 1718  
1719 1719          /*
1720 1720           * Create and nowait the gang children.
1721 1721           */
1722 1722          for (int g = 0; resid != 0; resid -= lsize, g++) {
1723 1723                  lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g),
1724 1724                      SPA_MINBLOCKSIZE);
1725 1725                  ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid);
1726 1726  
1727 1727                  zp.zp_checksum = gio->io_prop.zp_checksum;
1728 1728                  zp.zp_compress = ZIO_COMPRESS_OFF;
1729 1729                  zp.zp_type = DMU_OT_NONE;
1730 1730                  zp.zp_level = 0;
1731 1731                  zp.zp_copies = gio->io_prop.zp_copies;
1732 1732                  zp.zp_dedup = 0;
1733 1733                  zp.zp_dedup_verify = 0;
1734 1734  
1735 1735                  zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g],
1736 1736                      (char *)pio->io_data + (pio->io_size - resid), lsize, &zp,
1737 1737                      zio_write_gang_member_ready, NULL, &gn->gn_child[g],
1738 1738                      pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio),
1739 1739                      &pio->io_bookmark));
1740 1740          }
1741 1741  
1742 1742          /*
1743 1743           * Set pio's pipeline to just wait for zio to finish.
1744 1744           */
1745 1745          pio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
1746 1746  
1747 1747          zio_nowait(zio);
1748 1748  
1749 1749          return (ZIO_PIPELINE_CONTINUE);
1750 1750  }
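/*
 * Editor's note: a worked example of the member-size arithmetic in
 * zio_write_gang_block() above, not part of the original source.  For a
 * 128K (131072-byte) write that could not be allocated as a single block,
 * with SPA_GBH_NBLKPTRS = 3 and SPA_MINBLOCKSIZE = 512:
 *
 *      g = 0: lsize = P2ROUNDUP(131072 / 3, 512) = 44032, resid = 87040
 *      g = 1: lsize = P2ROUNDUP( 87040 / 2, 512) = 43520, resid = 43520
 *      g = 2: lsize = P2ROUNDUP( 43520 / 1, 512) = 43520, resid =     0
 *
 * so three gang members of 44032 + 43520 + 43520 = 131072 bytes cover the
 * original write, each a multiple of SPA_MINBLOCKSIZE.
 */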
1751 1751  
1752 1752  /*
1753 1753   * ==========================================================================
1754 1754   * Dedup
1755 1755   * ==========================================================================
1756 1756   */
1757 1757  static void
1758 1758  zio_ddt_child_read_done(zio_t *zio)
1759 1759  {
1760 1760          blkptr_t *bp = zio->io_bp;
1761 1761          ddt_entry_t *dde = zio->io_private;
1762 1762          ddt_phys_t *ddp;
1763 1763          zio_t *pio = zio_unique_parent(zio);
1764 1764  
1765 1765          mutex_enter(&pio->io_lock);
1766 1766          ddp = ddt_phys_select(dde, bp);
1767 1767          if (zio->io_error == 0)
1768 1768                  ddt_phys_clear(ddp);    /* this ddp doesn't need repair */
1769 1769          if (zio->io_error == 0 && dde->dde_repair_data == NULL)
1770 1770                  dde->dde_repair_data = zio->io_data;
1771 1771          else
1772 1772                  zio_buf_free(zio->io_data, zio->io_size);
1773 1773          mutex_exit(&pio->io_lock);
1774 1774  }
1775 1775  
1776 1776  static int
1777 1777  zio_ddt_read_start(zio_t *zio)
1778 1778  {
1779 1779          blkptr_t *bp = zio->io_bp;
1780 1780  
1781 1781          ASSERT(BP_GET_DEDUP(bp));
1782 1782          ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
1783 1783          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1784 1784  
1785 1785          if (zio->io_child_error[ZIO_CHILD_DDT]) {
1786 1786                  ddt_t *ddt = ddt_select(zio->io_spa, bp);
1787 1787                  ddt_entry_t *dde = ddt_repair_start(ddt, bp);
1788 1788                  ddt_phys_t *ddp = dde->dde_phys;
1789 1789                  ddt_phys_t *ddp_self = ddt_phys_select(dde, bp);
1790 1790                  blkptr_t blk;
1791 1791  
1792 1792                  ASSERT(zio->io_vsd == NULL);
1793 1793                  zio->io_vsd = dde;
1794 1794  
1795 1795                  if (ddp_self == NULL)
1796 1796                          return (ZIO_PIPELINE_CONTINUE);
1797 1797  
1798 1798                  for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
1799 1799                          if (ddp->ddp_phys_birth == 0 || ddp == ddp_self)
1800 1800                                  continue;
1801 1801                          ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp,
1802 1802                              &blk);
1803 1803                          zio_nowait(zio_read(zio, zio->io_spa, &blk,
1804 1804                              zio_buf_alloc(zio->io_size), zio->io_size,
1805 1805                              zio_ddt_child_read_done, dde, zio->io_priority,
1806 1806                              ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE,
1807 1807                              &zio->io_bookmark));
1808 1808                  }
1809 1809                  return (ZIO_PIPELINE_CONTINUE);
1810 1810          }
1811 1811  
1812 1812          zio_nowait(zio_read(zio, zio->io_spa, bp,
1813 1813              zio->io_data, zio->io_size, NULL, NULL, zio->io_priority,
1814 1814              ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark));
1815 1815  
1816 1816          return (ZIO_PIPELINE_CONTINUE);
1817 1817  }
1818 1818  
1819 1819  static int
1820 1820  zio_ddt_read_done(zio_t *zio)
1821 1821  {
1822 1822          blkptr_t *bp = zio->io_bp;
1823 1823  
1824 1824          if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE))
1825 1825                  return (ZIO_PIPELINE_STOP);
1826 1826  
1827 1827          ASSERT(BP_GET_DEDUP(bp));
1828 1828          ASSERT(BP_GET_PSIZE(bp) == zio->io_size);
1829 1829          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
1830 1830  
1831 1831          if (zio->io_child_error[ZIO_CHILD_DDT]) {
1832 1832                  ddt_t *ddt = ddt_select(zio->io_spa, bp);
1833 1833                  ddt_entry_t *dde = zio->io_vsd;
1834 1834                  if (ddt == NULL) {
1835 1835                          ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE);
1836 1836                          return (ZIO_PIPELINE_CONTINUE);
1837 1837                  }
1838 1838                  if (dde == NULL) {
1839 1839                          zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1;
1840 1840                          zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE);
1841 1841                          return (ZIO_PIPELINE_STOP);
1842 1842                  }
1843 1843                  if (dde->dde_repair_data != NULL) {
1844 1844                          bcopy(dde->dde_repair_data, zio->io_data, zio->io_size);
1845 1845                          zio->io_child_error[ZIO_CHILD_DDT] = 0;
1846 1846                  }
1847 1847                  ddt_repair_done(ddt, dde);
1848 1848                  zio->io_vsd = NULL;
1849 1849          }
1850 1850  
1851 1851          ASSERT(zio->io_vsd == NULL);
1852 1852  
1853 1853          return (ZIO_PIPELINE_CONTINUE);
1854 1854  }
1855 1855  
1856 1856  static boolean_t
1857 1857  zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde)
1858 1858  {
1859 1859          spa_t *spa = zio->io_spa;
1860 1860  
1861 1861          /*
1862 1862           * Note: we compare the original data, not the transformed data,
1863 1863           * because when zio->io_bp is an override bp, we will not have
1864 1864           * pushed the I/O transforms.  That's an important optimization
1865 1865           * because otherwise we'd compress/encrypt all dmu_sync() data twice.
1866 1866           */
1867 1867          for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
1868 1868                  zio_t *lio = dde->dde_lead_zio[p];
1869 1869  
1870 1870                  if (lio != NULL) {
1871 1871                          return (lio->io_orig_size != zio->io_orig_size ||
1872 1872                              bcmp(zio->io_orig_data, lio->io_orig_data,
1873 1873                              zio->io_orig_size) != 0);
1874 1874                  }
1875 1875          }
1876 1876  
1877 1877          for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) {
1878 1878                  ddt_phys_t *ddp = &dde->dde_phys[p];
1879 1879  
1880 1880                  if (ddp->ddp_phys_birth != 0) {
1881 1881                          arc_buf_t *abuf = NULL;
1882 1882                          uint32_t aflags = ARC_WAIT;
1883 1883                          blkptr_t blk = *zio->io_bp;
1884 1884                          int error;
1885 1885  
1886 1886                          ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth);
1887 1887  
1888 1888                          ddt_exit(ddt);
1889 1889  
1890 1890                          error = arc_read_nolock(NULL, spa, &blk,
1891 1891                              arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ,
1892 1892                              ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
1893 1893                              &aflags, &zio->io_bookmark);
1894 1894  
1895 1895                          if (error == 0) {
1896 1896                                  if (arc_buf_size(abuf) != zio->io_orig_size ||
1897 1897                                      bcmp(abuf->b_data, zio->io_orig_data,
1898 1898                                      zio->io_orig_size) != 0)
1899 1899                                          error = EEXIST;
1900 1900                                  VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1);
1901 1901                          }
1902 1902  
1903 1903                          ddt_enter(ddt);
1904 1904                          return (error != 0);
1905 1905                  }
1906 1906          }
1907 1907  
1908 1908          return (B_FALSE);
1909 1909  }
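/*
 * Editor's note (not part of the original source): in the write-side
 * functions below, 'int p = zio->io_prop.zp_copies' is used directly as an
 * index into dde->dde_phys[].  This works because the ddt_phys slots are
 * laid out as DDT_PHYS_DITTO (0), DDT_PHYS_SINGLE (1), DDT_PHYS_DOUBLE (2)
 * and DDT_PHYS_TRIPLE (3), so a block with N requested copies lands in
 * slot N, while slot 0 is reserved for the extra "ditto" copies written by
 * zio_ddt_ditto_write_done().
 */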
1910 1910  
1911 1911  static void
1912 1912  zio_ddt_child_write_ready(zio_t *zio)
1913 1913  {
1914 1914          int p = zio->io_prop.zp_copies;
1915 1915          ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
1916 1916          ddt_entry_t *dde = zio->io_private;
1917 1917          ddt_phys_t *ddp = &dde->dde_phys[p];
1918 1918          zio_t *pio;
1919 1919  
1920 1920          if (zio->io_error)
1921 1921                  return;
1922 1922  
1923 1923          ddt_enter(ddt);
1924 1924  
1925 1925          ASSERT(dde->dde_lead_zio[p] == zio);
1926 1926  
1927 1927          ddt_phys_fill(ddp, zio->io_bp);
1928 1928  
1929 1929          while ((pio = zio_walk_parents(zio)) != NULL)
1930 1930                  ddt_bp_fill(ddp, pio->io_bp, zio->io_txg);
1931 1931  
1932 1932          ddt_exit(ddt);
1933 1933  }
1934 1934  
1935 1935  static void
1936 1936  zio_ddt_child_write_done(zio_t *zio)
1937 1937  {
1938 1938          int p = zio->io_prop.zp_copies;
1939 1939          ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp);
1940 1940          ddt_entry_t *dde = zio->io_private;
1941 1941          ddt_phys_t *ddp = &dde->dde_phys[p];
1942 1942  
1943 1943          ddt_enter(ddt);
1944 1944  
1945 1945          ASSERT(ddp->ddp_refcnt == 0);
1946 1946          ASSERT(dde->dde_lead_zio[p] == zio);
1947 1947          dde->dde_lead_zio[p] = NULL;
1948 1948  
1949 1949          if (zio->io_error == 0) {
1950 1950                  while (zio_walk_parents(zio) != NULL)
1951 1951                          ddt_phys_addref(ddp);
1952 1952          } else {
1953 1953                  ddt_phys_clear(ddp);
1954 1954          }
1955 1955  
1956 1956          ddt_exit(ddt);
1957 1957  }
1958 1958  
1959 1959  static void
1960 1960  zio_ddt_ditto_write_done(zio_t *zio)
1961 1961  {
1962 1962          int p = DDT_PHYS_DITTO;
1963 1963          zio_prop_t *zp = &zio->io_prop;
1964 1964          blkptr_t *bp = zio->io_bp;
1965 1965          ddt_t *ddt = ddt_select(zio->io_spa, bp);
1966 1966          ddt_entry_t *dde = zio->io_private;
1967 1967          ddt_phys_t *ddp = &dde->dde_phys[p];
1968 1968          ddt_key_t *ddk = &dde->dde_key;
1969 1969  
1970 1970          ddt_enter(ddt);
1971 1971  
1972 1972          ASSERT(ddp->ddp_refcnt == 0);
1973 1973          ASSERT(dde->dde_lead_zio[p] == zio);
1974 1974          dde->dde_lead_zio[p] = NULL;
1975 1975  
1976 1976          if (zio->io_error == 0) {
1977 1977                  ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum));
1978 1978                  ASSERT(zp->zp_copies < SPA_DVAS_PER_BP);
1979 1979                  ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp));
1980 1980                  if (ddp->ddp_phys_birth != 0)
1981 1981                          ddt_phys_free(ddt, ddk, ddp, zio->io_txg);
1982 1982                  ddt_phys_fill(ddp, bp);
1983 1983          }
1984 1984  
1985 1985          ddt_exit(ddt);
1986 1986  }
1987 1987  
1988 1988  static int
1989 1989  zio_ddt_write(zio_t *zio)
1990 1990  {
1991 1991          spa_t *spa = zio->io_spa;
1992 1992          blkptr_t *bp = zio->io_bp;
1993 1993          uint64_t txg = zio->io_txg;
1994 1994          zio_prop_t *zp = &zio->io_prop;
1995 1995          int p = zp->zp_copies;
1996 1996          int ditto_copies;
1997 1997          zio_t *cio = NULL;
1998 1998          zio_t *dio = NULL;
1999 1999          ddt_t *ddt = ddt_select(spa, bp);
2000 2000          ddt_entry_t *dde;
2001 2001          ddt_phys_t *ddp;
2002 2002  
2003 2003          ASSERT(BP_GET_DEDUP(bp));
2004 2004          ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum);
2005 2005          ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override);
2006 2006  
2007 2007          ddt_enter(ddt);
2008 2008          dde = ddt_lookup(ddt, bp, B_TRUE);
2009 2009          ddp = &dde->dde_phys[p];
2010 2010  
2011 2011          if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) {
2012 2012                  /*
2013 2013                   * If we're using a weak checksum, upgrade to a strong checksum
2014 2014                   * and try again.  If we're already using a strong checksum,
2015 2015                   * we can't resolve it, so just convert to an ordinary write.
2016 2016                   * (And automatically e-mail a paper to Nature?)
2017 2017                   */
2018 2018                  if (!zio_checksum_table[zp->zp_checksum].ci_dedup) {
2019 2019                          zp->zp_checksum = spa_dedup_checksum(spa);
2020 2020                          zio_pop_transforms(zio);
2021 2021                          zio->io_stage = ZIO_STAGE_OPEN;
2022 2022                          BP_ZERO(bp);
2023 2023                  } else {
2024 2024                          zp->zp_dedup = 0;
2025 2025                  }
2026 2026                  zio->io_pipeline = ZIO_WRITE_PIPELINE;
2027 2027                  ddt_exit(ddt);
2028 2028                  return (ZIO_PIPELINE_CONTINUE);
2029 2029          }
2030 2030  
2031 2031          ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp);
2032 2032          ASSERT(ditto_copies < SPA_DVAS_PER_BP);
2033 2033  
2034 2034          if (ditto_copies > ddt_ditto_copies_present(dde) &&
2035 2035              dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) {
2036 2036                  zio_prop_t czp = *zp;
2037 2037  
2038 2038                  czp.zp_copies = ditto_copies;
2039 2039  
2040 2040                  /*
2041 2041                   * If we arrived here with an override bp, we won't have run
2042 2042                   * the transform stack, so we won't have the data we need to
2043 2043                   * generate a child i/o.  So, toss the override bp and restart.
2044 2044                   * This is safe, because using the override bp is just an
2045 2045                   * optimization; and it's rare, so the cost doesn't matter.
2046 2046                   */
2047 2047                  if (zio->io_bp_override) {
2048 2048                          zio_pop_transforms(zio);
2049 2049                          zio->io_stage = ZIO_STAGE_OPEN;
2050 2050                          zio->io_pipeline = ZIO_WRITE_PIPELINE;
2051 2051                          zio->io_bp_override = NULL;
2052 2052                          BP_ZERO(bp);
2053 2053                          ddt_exit(ddt);
2054 2054                          return (ZIO_PIPELINE_CONTINUE);
2055 2055                  }
2056 2056  
2057 2057                  dio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2058 2058                      zio->io_orig_size, &czp, NULL,
2059 2059                      zio_ddt_ditto_write_done, dde, zio->io_priority,
2060 2060                      ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2061 2061  
2062 2062                  zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL);
2063 2063                  dde->dde_lead_zio[DDT_PHYS_DITTO] = dio;
2064 2064          }
2065 2065  
2066 2066          if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) {
2067 2067                  if (ddp->ddp_phys_birth != 0)
2068 2068                          ddt_bp_fill(ddp, bp, txg);
2069 2069                  if (dde->dde_lead_zio[p] != NULL)
2070 2070                          zio_add_child(zio, dde->dde_lead_zio[p]);
2071 2071                  else
2072 2072                          ddt_phys_addref(ddp);
2073 2073          } else if (zio->io_bp_override) {
2074 2074                  ASSERT(bp->blk_birth == txg);
2075 2075                  ASSERT(BP_EQUAL(bp, zio->io_bp_override));
2076 2076                  ddt_phys_fill(ddp, bp);
2077 2077                  ddt_phys_addref(ddp);
2078 2078          } else {
2079 2079                  cio = zio_write(zio, spa, txg, bp, zio->io_orig_data,
2080 2080                      zio->io_orig_size, zp, zio_ddt_child_write_ready,
2081 2081                      zio_ddt_child_write_done, dde, zio->io_priority,
2082 2082                      ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark);
2083 2083  
2084 2084                  zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL);
2085 2085                  dde->dde_lead_zio[p] = cio;
2086 2086          }
2087 2087  
2088 2088          ddt_exit(ddt);
2089 2089  
2090 2090          if (cio)
2091 2091                  zio_nowait(cio);
2092 2092          if (dio)
2093 2093                  zio_nowait(dio);
2094 2094  
2095 2095          return (ZIO_PIPELINE_CONTINUE);
2096 2096  }
2097 2097  
2098 2098  ddt_entry_t *freedde; /* for debugging */
2099 2099  
2100 2100  static int
2101 2101  zio_ddt_free(zio_t *zio)
2102 2102  {
2103 2103          spa_t *spa = zio->io_spa;
2104 2104          blkptr_t *bp = zio->io_bp;
2105 2105          ddt_t *ddt = ddt_select(spa, bp);
2106 2106          ddt_entry_t *dde;
2107 2107          ddt_phys_t *ddp;
2108 2108  
2109 2109          ASSERT(BP_GET_DEDUP(bp));
2110 2110          ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2111 2111  
2112 2112          ddt_enter(ddt);
2113 2113          freedde = dde = ddt_lookup(ddt, bp, B_TRUE);
2114 2114          ddp = ddt_phys_select(dde, bp);
2115 2115          ddt_phys_decref(ddp);
2116 2116          ddt_exit(ddt);
2117 2117  
2118 2118          return (ZIO_PIPELINE_CONTINUE);
2119 2119  }
2120 2120  
2121 2121  /*
2122 2122   * ==========================================================================
2123 2123   * Allocate and free blocks
2124 2124   * ==========================================================================
2125 2125   */
2126 2126  static int
2127 2127  zio_dva_allocate(zio_t *zio)
2128 2128  {
2129 2129          spa_t *spa = zio->io_spa;
2130 2130          metaslab_class_t *mc = spa_normal_class(spa);
2131 2131          blkptr_t *bp = zio->io_bp;
2132 2132          int error;
2133 2133          int flags = 0;
2134 2134  
2135 2135          if (zio->io_gang_leader == NULL) {
2136 2136                  ASSERT(zio->io_child_type > ZIO_CHILD_GANG);
2137 2137                  zio->io_gang_leader = zio;
2138 2138          }
2139 2139  
2140 2140          ASSERT(BP_IS_HOLE(bp));
2141 2141          ASSERT3U(BP_GET_NDVAS(bp), ==, 0);
2142 2142          ASSERT3U(zio->io_prop.zp_copies, >, 0);
2143 2143          ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa));
2144 2144          ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp));
2145 2145  
2146 2146          /*
2147 2147           * The dump device does not support gang blocks so allocation on
2148 2148           * behalf of the dump device (i.e. ZIO_FLAG_NODATA) must avoid
2149 2149           * the "fast" gang feature.
2150 2150           */
2151 2151          flags |= (zio->io_flags & ZIO_FLAG_NODATA) ? METASLAB_GANG_AVOID : 0;
2152 2152          flags |= (zio->io_flags & ZIO_FLAG_GANG_CHILD) ?
2153 2153              METASLAB_GANG_CHILD : 0;
2154 2154          error = metaslab_alloc(spa, mc, zio->io_size, bp,
2155 2155              zio->io_prop.zp_copies, zio->io_txg, NULL, flags);
2156 2156  
2157 2157          if (error) {
2158 2158                  spa_dbgmsg(spa, "%s: metaslab allocation failure: zio %p, "
2159 2159                      "size %llu, error %d", spa_name(spa), zio, zio->io_size,
2160 2160                      error);
2161 2161                  if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE)
2162 2162                          return (zio_write_gang_block(zio));
2163 2163                  zio->io_error = error;
2164 2164          }
2165 2165  
2166 2166          return (ZIO_PIPELINE_CONTINUE);
2167 2167  }
2168 2168  
2169 2169  static int
2170 2170  zio_dva_free(zio_t *zio)
2171 2171  {
2172 2172          metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE);
2173 2173  
2174 2174          return (ZIO_PIPELINE_CONTINUE);
2175 2175  }
2176 2176  
2177 2177  static int
2178 2178  zio_dva_claim(zio_t *zio)
2179 2179  {
2180 2180          int error;
2181 2181  
2182 2182          error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg);
2183 2183          if (error)
2184 2184                  zio->io_error = error;
2185 2185  
2186 2186          return (ZIO_PIPELINE_CONTINUE);
2187 2187  }
2188 2188  
2189 2189  /*
2190 2190   * Undo an allocation.  This is used by zio_done() when an I/O fails
2191 2191   * and we want to give back the block we just allocated.
2192 2192   * This handles both normal blocks and gang blocks.
2193 2193   */
2194 2194  static void
2195 2195  zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp)
2196 2196  {
2197 2197          ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2198 2198          ASSERT(zio->io_bp_override == NULL);
2199 2199  
2200 2200          if (!BP_IS_HOLE(bp))
2201 2201                  metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE);
2202 2202  
2203 2203          if (gn != NULL) {
2204 2204                  for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) {
2205 2205                          zio_dva_unallocate(zio, gn->gn_child[g],
2206 2206                              &gn->gn_gbh->zg_blkptr[g]);
2207 2207                  }
2208 2208          }
2209 2209  }
2210 2210  
2211 2211  /*
2212 2212   * Try to allocate an intent log block.  Return 0 on success, errno on failure.
2213 2213   */
2214 2214  int
2215 2215  zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp,
2216 2216      uint64_t size, boolean_t use_slog)
2217 2217  {
2218 2218          int error = 1;
2219 2219  
2220 2220          ASSERT(txg > spa_syncing_txg(spa));
2221 2221  
2222 2222          /*
2223 2223           * ZIL blocks are always contiguous (i.e. not gang blocks) so we
2224 2224           * set the METASLAB_GANG_AVOID flag so that they don't "fast gang"
2225 2225           * when allocating them.
2226 2226           */
2227 2227          if (use_slog) {
2228 2228                  error = metaslab_alloc(spa, spa_log_class(spa), size,
2229 2229                      new_bp, 1, txg, old_bp,
2230 2230                      METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2231 2231          }
2232 2232  
2233 2233          if (error) {
2234 2234                  error = metaslab_alloc(spa, spa_normal_class(spa), size,
2235 2235                      new_bp, 1, txg, old_bp,
2236 2236                      METASLAB_HINTBP_AVOID | METASLAB_GANG_AVOID);
2237 2237          }
2238 2238  
2239 2239          if (error == 0) {
2240 2240                  BP_SET_LSIZE(new_bp, size);
2241 2241                  BP_SET_PSIZE(new_bp, size);
2242 2242                  BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF);
2243 2243                  BP_SET_CHECKSUM(new_bp,
2244 2244                      spa_version(spa) >= SPA_VERSION_SLIM_ZIL
2245 2245                      ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG);
2246 2246                  BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG);
2247 2247                  BP_SET_LEVEL(new_bp, 0);
2248 2248                  BP_SET_DEDUP(new_bp, 0);
2249 2249                  BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER);
2250 2250          }
2251 2251  
2252 2252          return (error);
2253 2253  }
2254 2254  
2255 2255  /*
2256 2256   * Free an intent log block.
2257 2257   */
2258 2258  void
2259 2259  zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp)
2260 2260  {
2261 2261          ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG);
2262 2262          ASSERT(!BP_IS_GANG(bp));
2263 2263  
2264 2264          zio_free(spa, txg, bp);
2265 2265  }
2266 2266  
2267 2267  /*
2268 2268   * ==========================================================================
2269 2269   * Read and write to physical devices
2270 2270   * ==========================================================================
2271 2271   */
2272 2272  static int
2273 2273  zio_vdev_io_start(zio_t *zio)
2274 2274  {
2275 2275          vdev_t *vd = zio->io_vd;
2276 2276          uint64_t align;
2277 2277          spa_t *spa = zio->io_spa;
2278 2278  
2279 2279          ASSERT(zio->io_error == 0);
2280 2280          ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0);
2281 2281  
2282 2282          if (vd == NULL) {
2283 2283                  if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2284 2284                          spa_config_enter(spa, SCL_ZIO, zio, RW_READER);
2285 2285  
2286 2286                  /*
2287 2287                   * The mirror_ops handle multiple DVAs in a single BP.
2288 2288                   */
2289 2289                  return (vdev_mirror_ops.vdev_op_io_start(zio));
2290 2290          }
2291 2291  
2292 2292          /*
2293 2293           * We keep track of time-sensitive I/Os so that the scan thread
2294 2294           * can quickly react to certain workloads.  In particular, we care
2295 2295           * about non-scrubbing, top-level reads and writes with the following
2296 2296           * characteristics:
2297 2297           *      - synchronous writes of user data to non-slog devices
2298 2298           *      - any reads of user data
2299 2299           * When these conditions are met, adjust the timestamp of spa_last_io
2300 2300           * which allows the scan thread to adjust its workload accordingly.
2301 2301           */
2302 2302          if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL &&
2303 2303              vd == vd->vdev_top && !vd->vdev_islog &&
2304 2304              zio->io_bookmark.zb_objset != DMU_META_OBJSET &&
2305 2305              zio->io_txg != spa_syncing_txg(spa)) {
2306 2306                  uint64_t old = spa->spa_last_io;
2307 2307                  uint64_t new = ddi_get_lbolt64();
2308 2308                  if (old != new)
2309 2309                          (void) atomic_cas_64(&spa->spa_last_io, old, new);
2310 2310          }
2311 2311  
2312 2312          align = 1ULL << vd->vdev_top->vdev_ashift;
2313 2313  
2314 2314          if (P2PHASE(zio->io_size, align) != 0) {
2315 2315                  uint64_t asize = P2ROUNDUP(zio->io_size, align);
2316 2316                  char *abuf = zio_buf_alloc(asize);
2317 2317                  ASSERT(vd == vd->vdev_top);
2318 2318                  if (zio->io_type == ZIO_TYPE_WRITE) {
2319 2319                          bcopy(zio->io_data, abuf, zio->io_size);
2320 2320                          bzero(abuf + zio->io_size, asize - zio->io_size);
2321 2321                  }
2322 2322                  zio_push_transform(zio, abuf, asize, asize, zio_subblock);
2323 2323          }
2324 2324  
2325 2325          ASSERT(P2PHASE(zio->io_offset, align) == 0);
2326 2326          ASSERT(P2PHASE(zio->io_size, align) == 0);
2327 2327          VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa));
2328 2328  
2329 2329          /*
2330 2330           * If this is a repair I/O, and there's no self-healing involved --
2331 2331           * that is, we're just resilvering what we expect to resilver --
2332 2332           * then don't do the I/O unless zio's txg is actually in vd's DTL.
2333 2333           * This prevents spurious resilvering with nested replication.
2334 2334           * For example, given a mirror of mirrors, (A+B)+(C+D), if only
2335 2335           * A is out of date, we'll read from C+D, then use the data to
2336 2336           * resilver A+B -- but we don't actually want to resilver B, just A.
2337 2337           * The top-level mirror has no way to know this, so instead we just
2338 2338           * discard unnecessary repairs as we work our way down the vdev tree.
2339 2339           * The same logic applies to any form of nested replication:
2340 2340           * ditto + mirror, RAID-Z + replacing, etc.  This covers them all.
2341 2341           */
2342 2342          if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) &&
2343 2343              !(zio->io_flags & ZIO_FLAG_SELF_HEAL) &&
2344 2344              zio->io_txg != 0 && /* not a delegated i/o */
2345 2345              !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) {
2346 2346                  ASSERT(zio->io_type == ZIO_TYPE_WRITE);
2347 2347                  zio_vdev_io_bypass(zio);
2348 2348                  return (ZIO_PIPELINE_CONTINUE);
2349 2349          }
2350 2350  
2351 2351          if (vd->vdev_ops->vdev_op_leaf &&
2352 2352              (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) {
2353 2353  
2354 2354                  if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0)
2355 2355                          return (ZIO_PIPELINE_CONTINUE);
2356 2356  
2357 2357                  if ((zio = vdev_queue_io(zio)) == NULL)
2358 2358                          return (ZIO_PIPELINE_STOP);
2359 2359  
2360 2360                  if (!vdev_accessible(vd, zio)) {
2361 2361                          zio->io_error = ENXIO;
2362 2362                          zio_interrupt(zio);
2363 2363                          return (ZIO_PIPELINE_STOP);
2364 2364                  }
2365 2365          }
2366 2366  
2367 2367          return (vd->vdev_ops->vdev_op_io_start(zio));
2368 2368  }
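/*
 * Editor's note: a worked example of the alignment handling in
 * zio_vdev_io_start() above, not part of the original source.  On a
 * top-level vdev with vdev_ashift = 12 (4K sectors), align = 1 << 12 = 4096.
 * A 6144-byte write has P2PHASE(6144, 4096) = 2048 != 0, so a padded buffer
 * of asize = P2ROUNDUP(6144, 4096) = 8192 bytes is allocated, the 6144 data
 * bytes are copied in, the trailing 2048 bytes are zeroed, and the transform
 * is recorded via zio_push_transform() so the original buffer is restored
 * when the I/O completes.
 */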
2369 2369  
2370 2370  static int
2371 2371  zio_vdev_io_done(zio_t *zio)
2372 2372  {
2373 2373          vdev_t *vd = zio->io_vd;
2374 2374          vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops;
2375 2375          boolean_t unexpected_error = B_FALSE;
2376 2376  
2377 2377          if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2378 2378                  return (ZIO_PIPELINE_STOP);
2379 2379  
2380 2380          ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE);
2381 2381  
2382 2382          if (vd != NULL && vd->vdev_ops->vdev_op_leaf) {
2383 2383  
2384 2384                  vdev_queue_io_done(zio);
2385 2385  
2386 2386                  if (zio->io_type == ZIO_TYPE_WRITE)
2387 2387                          vdev_cache_write(zio);
2388 2388  
2389 2389                  if (zio_injection_enabled && zio->io_error == 0)
2390 2390                          zio->io_error = zio_handle_device_injection(vd,
2391 2391                              zio, EIO);
2392 2392  
2393 2393                  if (zio_injection_enabled && zio->io_error == 0)
2394 2394                          zio->io_error = zio_handle_label_injection(zio, EIO);
2395 2395  
2396 2396                  if (zio->io_error) {
2397 2397                          if (!vdev_accessible(vd, zio)) {
2398 2398                                  zio->io_error = ENXIO;
2399 2399                          } else {
2400 2400                                  unexpected_error = B_TRUE;
2401 2401                          }
2402 2402                  }
2403 2403          }
2404 2404  
2405 2405          ops->vdev_op_io_done(zio);
2406 2406  
2407 2407          if (unexpected_error)
2408 2408                  VERIFY(vdev_probe(vd, zio) == NULL);
2409 2409  
2410 2410          return (ZIO_PIPELINE_CONTINUE);
2411 2411  }
2412 2412  
2413 2413  /*
2414 2414   * For non-raidz ZIOs, we can just copy aside the bad data read from the
2415 2415   * disk, and use that to finish the checksum ereport later.
2416 2416   */
2417 2417  static void
2418 2418  zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr,
2419 2419      const void *good_buf)
2420 2420  {
2421 2421          /* no processing needed */
2422 2422          zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE);
2423 2423  }
2424 2424  
2425 2425  /*ARGSUSED*/
2426 2426  void
2427 2427  zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored)
2428 2428  {
2429 2429          void *buf = zio_buf_alloc(zio->io_size);
2430 2430  
2431 2431          bcopy(zio->io_data, buf, zio->io_size);
2432 2432  
2433 2433          zcr->zcr_cbinfo = zio->io_size;
2434 2434          zcr->zcr_cbdata = buf;
2435 2435          zcr->zcr_finish = zio_vsd_default_cksum_finish;
2436 2436          zcr->zcr_free = zio_buf_free;
2437 2437  }
2438 2438  
2439 2439  static int
2440 2440  zio_vdev_io_assess(zio_t *zio)
2441 2441  {
2442 2442          vdev_t *vd = zio->io_vd;
2443 2443  
2444 2444          if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE))
2445 2445                  return (ZIO_PIPELINE_STOP);
2446 2446  
2447 2447          if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER))
2448 2448                  spa_config_exit(zio->io_spa, SCL_ZIO, zio);
2449 2449  
2450 2450          if (zio->io_vsd != NULL) {
2451 2451                  zio->io_vsd_ops->vsd_free(zio);
2452 2452                  zio->io_vsd = NULL;
2453 2453          }
2454 2454  
2455 2455          if (zio_injection_enabled && zio->io_error == 0)
2456 2456                  zio->io_error = zio_handle_fault_injection(zio, EIO);
2457 2457  
2458 2458          /*
2459 2459           * If the I/O failed, determine whether we should attempt to retry it.
2460 2460           *
2461 2461           * On retry, we cut in line in the issue queue, since we don't want
2462 2462           * compression/checksumming/etc. work to prevent our (cheap) IO reissue.
2463 2463           */
2464 2464          if (zio->io_error && vd == NULL &&
2465 2465              !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) {
2466 2466                  ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */
2467 2467                  ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS));  /* not a leaf */
2468 2468                  zio->io_error = 0;
2469 2469                  zio->io_flags |= ZIO_FLAG_IO_RETRY |
2470 2470                      ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE;
2471 2471                  zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1;
2472 2472                  zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE,
2473 2473                      zio_requeue_io_start_cut_in_line);
2474 2474                  return (ZIO_PIPELINE_STOP);
2475 2475          }
2476 2476  
2477 2477          /*
2478 2478           * If we got an error on a leaf device, convert it to ENXIO
2479 2479           * if the device is not accessible at all.
2480 2480           */
2481 2481          if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf &&
2482 2482              !vdev_accessible(vd, zio))
2483 2483                  zio->io_error = ENXIO;
2484 2484  
2485 2485          /*
2486 2486           * If we can't write to an interior vdev (mirror or RAID-Z),
2487 2487           * set vdev_cant_write so that we stop trying to allocate from it.
2488 2488           */
2489 2489          if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE &&
2490 2490              vd != NULL && !vd->vdev_ops->vdev_op_leaf)
2491 2491                  vd->vdev_cant_write = B_TRUE;
2492 2492  
2493 2493          if (zio->io_error)
2494 2494                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2495 2495  
2496 2496          return (ZIO_PIPELINE_CONTINUE);
2497 2497  }
2498 2498  
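           /*
            * Descriptive note (editorial annotation): pipeline stages are one-hot
            * bits, and zio_execute() advances by shifting the current stage bit
            * left until it reaches a stage enabled in io_pipeline.  Shifting
            * io_stage right by one in the helpers below therefore causes the named
            * stage to be issued (again): reissue and redone rerun VDEV_IO_START
            * and VDEV_IO_DONE respectively, and bypass skips ahead so that
            * VDEV_IO_ASSESS runs next.
            */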
2499 2499  void
2500 2500  zio_vdev_io_reissue(zio_t *zio)
2501 2501  {
2502 2502          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2503 2503          ASSERT(zio->io_error == 0);
2504 2504  
2505 2505          zio->io_stage >>= 1;
2506 2506  }
2507 2507  
2508 2508  void
2509 2509  zio_vdev_io_redone(zio_t *zio)
2510 2510  {
2511 2511          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE);
2512 2512  
2513 2513          zio->io_stage >>= 1;
2514 2514  }
2515 2515  
2516 2516  void
2517 2517  zio_vdev_io_bypass(zio_t *zio)
2518 2518  {
2519 2519          ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START);
2520 2520          ASSERT(zio->io_error == 0);
2521 2521  
2522 2522          zio->io_flags |= ZIO_FLAG_IO_BYPASS;
2523 2523          zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1;
2524 2524  }
2525 2525  
2526 2526  /*
2527 2527   * ==========================================================================
2528 2528   * Generate and verify checksums
2529 2529   * ==========================================================================
2530 2530   */
2531 2531  static int
2532 2532  zio_checksum_generate(zio_t *zio)
2533 2533  {
2534 2534          blkptr_t *bp = zio->io_bp;
2535 2535          enum zio_checksum checksum;
2536 2536  
2537 2537          if (bp == NULL) {
2538 2538                  /*
2539 2539                   * This is zio_write_phys().
2540 2540                   * We're either generating a label checksum, or none at all.
2541 2541                   */
2542 2542                  checksum = zio->io_prop.zp_checksum;
2543 2543  
2544 2544                  if (checksum == ZIO_CHECKSUM_OFF)
2545 2545                          return (ZIO_PIPELINE_CONTINUE);
2546 2546  
2547 2547                  ASSERT(checksum == ZIO_CHECKSUM_LABEL);
2548 2548          } else {
2549 2549                  if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) {
2550 2550                          ASSERT(!IO_IS_ALLOCATING(zio));
2551 2551                          checksum = ZIO_CHECKSUM_GANG_HEADER;
2552 2552                  } else {
2553 2553                          checksum = BP_GET_CHECKSUM(bp);
2554 2554                  }
2555 2555          }
2556 2556  
2557 2557          zio_checksum_compute(zio, checksum, zio->io_data, zio->io_size);
2558 2558  
2559 2559          return (ZIO_PIPELINE_CONTINUE);
2560 2560  }
2561 2561  
2562 2562  static int
2563 2563  zio_checksum_verify(zio_t *zio)
2564 2564  {
2565 2565          zio_bad_cksum_t info;
2566 2566          blkptr_t *bp = zio->io_bp;
2567 2567          int error;
2568 2568  
2569 2569          ASSERT(zio->io_vd != NULL);
2570 2570  
2571 2571          if (bp == NULL) {
2572 2572                  /*
2573 2573                   * This is zio_read_phys().
2574 2574                   * We're either verifying a label checksum, or nothing at all.
2575 2575                   */
2576 2576                  if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF)
2577 2577                          return (ZIO_PIPELINE_CONTINUE);
2578 2578  
2579 2579                  ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL);
2580 2580          }
2581 2581  
2582 2582          if ((error = zio_checksum_error(zio, &info)) != 0) {
2583 2583                  zio->io_error = error;
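                           /*
                            * Note (editorial annotation): speculative I/Os
                            * (e.g. prefetch) record the checksum error but
                            * do not start an ereport, presumably because no
                            * consumer is waiting on this data.
                            */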
2584 2584                  if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2585 2585                          zfs_ereport_start_checksum(zio->io_spa,
2586 2586                              zio->io_vd, zio, zio->io_offset,
2587 2587                              zio->io_size, NULL, &info);
2588 2588                  }
2589 2589          }
2590 2590  
2591 2591          return (ZIO_PIPELINE_CONTINUE);
2592 2592  }
2593 2593  
2594 2594  /*
2595 2595   * Called by RAID-Z to ensure we don't compute the checksum twice.
2596 2596   */
2597 2597  void
2598 2598  zio_checksum_verified(zio_t *zio)
2599 2599  {
2600 2600          zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY;
2601 2601  }
2602 2602  
2603 2603  /*
2604 2604   * ==========================================================================
2605 2605   * Error rank.  Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other.
2606 2606   * An error of 0 indicates success.  ENXIO indicates whole-device failure,
2607 2607   * which may be transient (e.g. unplugged) or permanent.  ECKSUM and EIO
2608 2608   * indicate errors that are specific to one I/O, and most likely permanent.
2609 2609   * Any other error is presumed to be worse because we weren't expecting it.
2610 2610   * ==========================================================================
2611 2611   */
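           /*
            * Worked example (editorial annotation): zio_worst_error(ENXIO, EIO)
            * returns EIO, since EIO ranks after ENXIO.  An errno not listed in
            * zio_error_rank[] falls off the end of the lookup loop and therefore
            * ranks worst of all.
            */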
2612 2612  int
2613 2613  zio_worst_error(int e1, int e2)
2614 2614  {
2615 2615          static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO };
2616 2616          int r1, r2;
2617 2617  
2618 2618          for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++)
2619 2619                  if (e1 == zio_error_rank[r1])
2620 2620                          break;
2621 2621  
2622 2622          for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++)
2623 2623                  if (e2 == zio_error_rank[r2])
2624 2624                          break;
2625 2625  
2626 2626          return (r1 > r2 ? e1 : e2);
2627 2627  }
2628 2628  
2629 2629  /*
2630 2630   * ==========================================================================
2631 2631   * I/O completion
2632 2632   * ==========================================================================
2633 2633   */
2634 2634  static int
2635 2635  zio_ready(zio_t *zio)
2636 2636  {
2637 2637          blkptr_t *bp = zio->io_bp;
2638 2638          zio_t *pio, *pio_next;
2639 2639  
2640 2640          if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) ||
2641 2641              zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY))
2642 2642                  return (ZIO_PIPELINE_STOP);
2643 2643  
2644 2644          if (zio->io_ready) {
2645 2645                  ASSERT(IO_IS_ALLOCATING(zio));
2646 2646                  ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp));
2647 2647                  ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0);
2648 2648  
2649 2649                  zio->io_ready(zio);
2650 2650          }
2651 2651  
2652 2652          if (bp != NULL && bp != &zio->io_bp_copy)
2653 2653                  zio->io_bp_copy = *bp;
2654 2654  
2655 2655          if (zio->io_error)
2656 2656                  zio->io_pipeline = ZIO_INTERLOCK_PIPELINE;
2657 2657  
2658 2658          mutex_enter(&zio->io_lock);
2659 2659          zio->io_state[ZIO_WAIT_READY] = 1;
2660 2660          pio = zio_walk_parents(zio);
2661 2661          mutex_exit(&zio->io_lock);
2662 2662  
2663 2663          /*
2664 2664           * As we notify zio's parents, new parents could be added.
2665 2665           * New parents go to the head of zio's io_parent_list, however,
2666 2666           * so we will (correctly) not notify them.  The remainder of zio's
2667 2667           * io_parent_list, from 'pio_next' onward, cannot change because
2668 2668           * all parents must wait for us to be done before they can be done.
2669 2669           */
2670 2670          for (; pio != NULL; pio = pio_next) {
2671 2671                  pio_next = zio_walk_parents(zio);
2672 2672                  zio_notify_parent(pio, zio, ZIO_WAIT_READY);
2673 2673          }
2674 2674  
2675 2675          if (zio->io_flags & ZIO_FLAG_NODATA) {
2676 2676                  if (BP_IS_GANG(bp)) {
2677 2677                          zio->io_flags &= ~ZIO_FLAG_NODATA;
2678 2678                  } else {
2679 2679                          ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE);
2680 2680                          zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
2681 2681                  }
2682 2682          }
2683 2683  
2684 2684          if (zio_injection_enabled &&
2685 2685              zio->io_spa->spa_syncing_txg == zio->io_txg)
2686 2686                  zio_handle_ignored_writes(zio);
2687 2687  
2688 2688          return (ZIO_PIPELINE_CONTINUE);
2689 2689  }
2690 2690  
2691 2691  static int
2692 2692  zio_done(zio_t *zio)
2693 2693  {
2694 2694          spa_t *spa = zio->io_spa;
2695 2695          zio_t *lio = zio->io_logical;
2696 2696          blkptr_t *bp = zio->io_bp;
2697 2697          vdev_t *vd = zio->io_vd;
2698 2698          uint64_t psize = zio->io_size;
2699 2699          zio_t *pio, *pio_next;
2700 2700  
2701 2701          /*
2702 2702           * If our children haven't all completed,
2703 2703           * wait for them and then repeat this pipeline stage.
2704 2704           */
2705 2705          if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) ||
2706 2706              zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) ||
2707 2707              zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) ||
2708 2708              zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE))
2709 2709                  return (ZIO_PIPELINE_STOP);
2710 2710  
2711 2711          for (int c = 0; c < ZIO_CHILD_TYPES; c++)
2712 2712                  for (int w = 0; w < ZIO_WAIT_TYPES; w++)
2713 2713                          ASSERT(zio->io_children[c][w] == 0);
2714 2714  
2715 2715          if (bp != NULL) {
2716 2716                  ASSERT(bp->blk_pad[0] == 0);
2717 2717                  ASSERT(bp->blk_pad[1] == 0);
2718 2718                  ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 ||
2719 2719                      (bp == zio_unique_parent(zio)->io_bp));
2720 2720                  if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) &&
2721 2721                      zio->io_bp_override == NULL &&
2722 2722                      !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) {
2723 2723                          ASSERT(!BP_SHOULD_BYTESWAP(bp));
2724 2724                          ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp));
2725 2725                          ASSERT(BP_COUNT_GANG(bp) == 0 ||
2726 2726                              (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp)));
2727 2727                  }
2728 2728          }
2729 2729  
2730 2730          /*
2731 2731           * If there were child vdev/gang/ddt errors, they apply to us now.
2732 2732           */
2733 2733          zio_inherit_child_errors(zio, ZIO_CHILD_VDEV);
2734 2734          zio_inherit_child_errors(zio, ZIO_CHILD_GANG);
2735 2735          zio_inherit_child_errors(zio, ZIO_CHILD_DDT);
2736 2736  
2737 2737          /*
2738 2738           * If the I/O on the transformed data was successful, generate any
2739 2739           * checksum reports now while we still have the transformed data.
2740 2740           */
2741 2741          if (zio->io_error == 0) {
2742 2742                  while (zio->io_cksum_report != NULL) {
2743 2743                          zio_cksum_report_t *zcr = zio->io_cksum_report;
2744 2744                          uint64_t align = zcr->zcr_align;
2745 2745                          uint64_t asize = P2ROUNDUP(psize, align);
2746 2746                          char *abuf = zio->io_data;
2747 2747  
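                                   /*
                                    * Note (editorial annotation): pad a copy
                                    * of the data out to a multiple of the
                                    * report's alignment (zcr_align), zero-
                                    * filling the tail, before handing it to
                                    * zcr_finish().
                                    */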
2748 2748                          if (asize != psize) {
2749 2749                                  abuf = zio_buf_alloc(asize);
2750 2750                                  bcopy(zio->io_data, abuf, psize);
2751 2751                                  bzero(abuf + psize, asize - psize);
2752 2752                          }
2753 2753  
2754 2754                          zio->io_cksum_report = zcr->zcr_next;
2755 2755                          zcr->zcr_next = NULL;
2756 2756                          zcr->zcr_finish(zcr, abuf);
2757 2757                          zfs_ereport_free_checksum(zcr);
2758 2758  
2759 2759                          if (asize != psize)
2760 2760                                  zio_buf_free(abuf, asize);
2761 2761                  }
2762 2762          }
2763 2763  
2764 2764          zio_pop_transforms(zio);        /* note: may set zio->io_error */
2765 2765  
2766 2766          vdev_stat_update(zio, psize);
2767 2767  
2768 2768          if (zio->io_error) {
2769 2769                  /*
2770 2770                   * If this I/O is attached to a particular vdev,
2771 2771                   * generate an error message describing the I/O failure
2772 2772                   * at the block level.  We ignore these errors if the
2773 2773                   * device is currently unavailable.
2774 2774                   */
2775 2775                  if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd))
2776 2776                          zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0);
2777 2777  
2778 2778                  if ((zio->io_error == EIO || !(zio->io_flags &
2779 2779                      (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) &&
2780 2780                      zio == lio) {
2781 2781                          /*
2782 2782                           * For logical I/O requests, tell the SPA to log the
2783 2783                           * error and generate a logical data ereport.
2784 2784                           */
2785 2785                          spa_log_error(spa, zio);
2786 2786                          zfs_ereport_post(FM_EREPORT_ZFS_DATA, spa, NULL, zio,
2787 2787                              0, 0);
2788 2788                  }
2789 2789          }
2790 2790  
2791 2791          if (zio->io_error && zio == lio) {
2792 2792                  /*
2793 2793                   * Determine whether zio should be reexecuted.  This will
2794 2794                   * propagate all the way to the root via zio_notify_parent().
2795 2795                   */
2796 2796                  ASSERT(vd == NULL && bp != NULL);
2797 2797                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2798 2798  
2799 2799                  if (IO_IS_ALLOCATING(zio) &&
2800 2800                      !(zio->io_flags & ZIO_FLAG_CANFAIL)) {
2801 2801                          if (zio->io_error != ENOSPC)
2802 2802                                  zio->io_reexecute |= ZIO_REEXECUTE_NOW;
2803 2803                          else
2804 2804                                  zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2805 2805                  }
2806 2806  
2807 2807                  if ((zio->io_type == ZIO_TYPE_READ ||
2808 2808                      zio->io_type == ZIO_TYPE_FREE) &&
2809 2809                      !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) &&
2810 2810                      zio->io_error == ENXIO &&
2811 2811                      spa_load_state(spa) == SPA_LOAD_NONE &&
2812 2812                      spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)
2813 2813                          zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2814 2814  
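                   /*
                    * Note (editorial annotation): an I/O that is not allowed
                    * to fail and has no other reexecute decision by this
                    * point must suspend rather than return an error to its
                    * caller.
                    */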
2815 2815                  if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute)
2816 2816                          zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND;
2817 2817  
2818 2818                  /*
2819 2819                   * Here is a possibly good place to attempt to do
2820 2820                   * either combinatorial reconstruction or error correction
2821 2821                   * based on checksums.  It also might be a good place
2822 2822                   * to send out preliminary ereports before we suspend
2823 2823                   * processing.
2824 2824                   */
2825 2825          }
2826 2826  
2827 2827          /*
2828 2828           * If there were logical child errors, they apply to us now.
2829 2829           * We defer this until now to avoid conflating logical child
2830 2830           * errors with errors that happened to the zio itself when
2831 2831           * updating vdev stats and reporting FMA events above.
2832 2832           */
2833 2833          zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL);
2834 2834  
2835 2835          if ((zio->io_error || zio->io_reexecute) &&
2836 2836              IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio &&
2837 2837              !(zio->io_flags & ZIO_FLAG_IO_REWRITE))
2838 2838                  zio_dva_unallocate(zio, zio->io_gang_tree, bp);
2839 2839  
2840 2840          zio_gang_tree_free(&zio->io_gang_tree);
2841 2841  
2842 2842          /*
2843 2843           * Godfather I/Os should never suspend.
2844 2844           */
2845 2845          if ((zio->io_flags & ZIO_FLAG_GODFATHER) &&
2846 2846              (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND))
2847 2847                  zio->io_reexecute = 0;
2848 2848  
2849 2849          if (zio->io_reexecute) {
2850 2850                  /*
2851 2851                   * This is a logical I/O that wants to reexecute.
2852 2852                   *
2853 2853                   * Reexecute is top-down.  When an i/o fails, if it's not
2854 2854                   * the root, it simply notifies its parent and sticks around.
2855 2855                   * The parent, seeing that it still has children in zio_done(),
2856 2856                   * does the same.  This percolates all the way up to the root.
2857 2857                   * The root i/o will reexecute or suspend the entire tree.
2858 2858                   *
2859 2859                   * This approach ensures that zio_reexecute() honors
2860 2860                   * all the original i/o dependency relationships, e.g.
2861 2861                   * parents not executing until children are ready.
2862 2862                   */
2863 2863                  ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
2864 2864  
2865 2865                  zio->io_gang_leader = NULL;
2866 2866  
2867 2867                  mutex_enter(&zio->io_lock);
2868 2868                  zio->io_state[ZIO_WAIT_DONE] = 1;
2869 2869                  mutex_exit(&zio->io_lock);
2870 2870  
2871 2871                  /*
2872 2872                   * "The Godfather" I/O monitors its children but is
2873 2873                   * not a true parent to them. It will track them through
2874 2874                   * the pipeline but severs its ties whenever they get into
2875 2875                   * trouble (e.g. suspended). This allows "The Godfather"
2876 2876                   * I/O to return status without blocking.
2877 2877                   */
2878 2878                  for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
2879 2879                          zio_link_t *zl = zio->io_walk_link;
2880 2880                          pio_next = zio_walk_parents(zio);
2881 2881  
2882 2882                          if ((pio->io_flags & ZIO_FLAG_GODFATHER) &&
2883 2883                              (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) {
2884 2884                                  zio_remove_child(pio, zio, zl);
2885 2885                                  zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2886 2886                          }
2887 2887                  }
2888 2888  
2889 2889                  if ((pio = zio_unique_parent(zio)) != NULL) {
2890 2890                          /*
2891 2891                           * We're not a root i/o, so there's nothing to do
2892 2892                           * but notify our parent.  Don't propagate errors
2893 2893                           * upward since we haven't permanently failed yet.
2894 2894                           */
2895 2895                          ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
2896 2896                          zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE;
2897 2897                          zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2898 2898                  } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) {
2899 2899                          /*
2900 2900                           * We'd fail again if we reexecuted now, so suspend
2901 2901                           * until conditions improve (e.g. device comes online).
2902 2902                           */
2903 2903                          zio_suspend(spa, zio);
2904 2904                  } else {
2905 2905                          /*
2906 2906                           * Reexecution is potentially a huge amount of work.
2907 2907                           * Hand it off to the otherwise-unused claim taskq.
2908 2908                           */
2909 2909                          ASSERT(zio->io_tqent.tqent_next == NULL);
2910 2910                          (void) taskq_dispatch_ent(
2911 2911                              spa->spa_zio_taskq[ZIO_TYPE_CLAIM][ZIO_TASKQ_ISSUE],
2912 2912                              (task_func_t *)zio_reexecute, zio, 0,
2913 2913                              &zio->io_tqent);
2914 2914                  }
2915 2915                  return (ZIO_PIPELINE_STOP);
2916 2916          }
2917 2917  
2918 2918          ASSERT(zio->io_child_count == 0);
2919 2919          ASSERT(zio->io_reexecute == 0);
2920 2920          ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL));
2921 2921  
2922 2922          /*
2923 2923           * Report any checksum errors, since the I/O is complete.
2924 2924           */
2925 2925          while (zio->io_cksum_report != NULL) {
2926 2926                  zio_cksum_report_t *zcr = zio->io_cksum_report;
2927 2927                  zio->io_cksum_report = zcr->zcr_next;
2928 2928                  zcr->zcr_next = NULL;
2929 2929                  zcr->zcr_finish(zcr, NULL);
2930 2930                  zfs_ereport_free_checksum(zcr);
2931 2931          }
2932 2932  
2933 2933          /*
2934 2934           * It is the responsibility of the done callback to ensure that this
2935 2935           * particular zio is no longer discoverable for adoption, and as
2936 2936           * such, cannot acquire any new parents.
2937 2937           */
2938 2938          if (zio->io_done)
2939 2939                  zio->io_done(zio);
2940 2940  
2941 2941          mutex_enter(&zio->io_lock);
2942 2942          zio->io_state[ZIO_WAIT_DONE] = 1;
2943 2943          mutex_exit(&zio->io_lock);
2944 2944  
2945 2945          for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) {
2946 2946                  zio_link_t *zl = zio->io_walk_link;
2947 2947                  pio_next = zio_walk_parents(zio);
2948 2948                  zio_remove_child(pio, zio, zl);
2949 2949                  zio_notify_parent(pio, zio, ZIO_WAIT_DONE);
2950 2950          }
2951 2951  
2952 2952          if (zio->io_waiter != NULL) {
2953 2953                  mutex_enter(&zio->io_lock);
2954 2954                  zio->io_executor = NULL;
2955 2955                  cv_broadcast(&zio->io_cv);
2956 2956                  mutex_exit(&zio->io_lock);
2957 2957          } else {
2958 2958                  zio_destroy(zio);
2959 2959          }
2960 2960  
2961 2961          return (ZIO_PIPELINE_STOP);
2962 2962  }
2963 2963  
2964 2964  /*
2965 2965   * ==========================================================================
2966 2966   * I/O pipeline definition
2967 2967   * ==========================================================================
2968 2968   */
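           /*
            * Descriptive note (editorial annotation): this table is indexed by
            * pipeline stage -- entry n is the work function for stage bit
            * (1 << n), so the leading NULL corresponds to ZIO_STAGE_OPEN, which
            * has no work function.  The order must match the enum zio_stage
            * definition in zio_impl.h.
            */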
2969 2969  static zio_pipe_stage_t *zio_pipeline[] = {
2970 2970          NULL,
2971 2971          zio_read_bp_init,
2972 2972          zio_free_bp_init,
2973 2973          zio_issue_async,
2974 2974          zio_write_bp_init,
2975 2975          zio_checksum_generate,
2976 2976          zio_ddt_read_start,
2977 2977          zio_ddt_read_done,
2978 2978          zio_ddt_write,
2979 2979          zio_ddt_free,
2980 2980          zio_gang_assemble,
2981 2981          zio_gang_issue,
2982 2982          zio_dva_allocate,
2983 2983          zio_dva_free,
2984 2984          zio_dva_claim,
2985 2985          zio_ready,
2986 2986          zio_vdev_io_start,
2987 2987          zio_vdev_io_done,
2988 2988          zio_vdev_io_assess,
2989 2989          zio_checksum_verify,
2990 2990          zio_done
2991 2991  };
     2992 +
     2993 +/* dnp is the dnode for zb1->zb_object */
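           /*
            * Descriptive note (editorial annotation): returns B_TRUE if zb1 (and
            * everything beneath it) falls before the level-0 bookmark zb2 in
            * traversal order.  zb1nextL0 is the first level-0 blkid beyond zb1's
            * subtree: with dn_indblkshift == 14 and SPA_BLKPTRSHIFT == 7 (values
            * assumed here only for illustration), each indirect block covers 128
            * children, so a level-1 bookmark at blkid B spans L0 blkids
            * [B*128, (B+1)*128) and zb1nextL0 is (B+1)*128.  Bookmarks within the
            * meta-dnode are converted to object numbers before the comparison.
            */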
     2994 +boolean_t
     2995 +zbookmark_is_before(const dnode_phys_t *dnp, const zbookmark_t *zb1,
     2996 +    const zbookmark_t *zb2)
     2997 +{
     2998 +        uint64_t zb1nextL0, zb2thisobj;
     2999 +
     3000 +        ASSERT(zb1->zb_objset == zb2->zb_objset);
     3001 +        ASSERT(zb2->zb_level == 0);
     3002 +
     3003 +        /*
     3004 +         * A bookmark in the deadlist is considered to be after
     3005 +         * everything else.
     3006 +         */
     3007 +        if (zb2->zb_object == DMU_DEADLIST_OBJECT)
     3008 +                return (B_TRUE);
     3009 +
     3010 +        /* The objset_phys_t isn't before anything. */
     3011 +        if (dnp == NULL)
     3012 +                return (B_FALSE);
     3013 +
     3014 +        zb1nextL0 = (zb1->zb_blkid + 1) <<
     3015 +            ((zb1->zb_level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT));
     3016 +
     3017 +        zb2thisobj = zb2->zb_object ? zb2->zb_object :
     3018 +            zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT);
     3019 +
     3020 +        if (zb1->zb_object == DMU_META_DNODE_OBJECT) {
     3021 +                uint64_t nextobj = zb1nextL0 *
     3022 +                    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT;
     3023 +                return (nextobj <= zb2thisobj);
     3024 +        }
     3025 +
     3026 +        if (zb1->zb_object < zb2thisobj)
     3027 +                return (B_TRUE);
     3028 +        if (zb1->zb_object > zb2thisobj)
     3029 +                return (B_FALSE);
     3030 +        if (zb2->zb_object == DMU_META_DNODE_OBJECT)
     3031 +                return (B_FALSE);
     3032 +        return (zb1nextL0 <= zb2->zb_blkid);
     3033 +}
    