Print this page
NEX-16191 scrub after trim finds thousands of checksum errors
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
NEX-15749 zpool trim command for a raidz-pool causes panic
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-15749 zpool trim command for a raidz-pool causes panic
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-14571 remove isal support remnants
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4620 ZFS autotrim triggering is unreliable
NEX-4622 On-demand TRIM code illogically enumerates metaslabs via mg_ms_tree
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Hans Rosenfeld <hans.rosenfeld@nexenta.com>
NEX-3984 On-demand TRIM
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Conflicts:
        usr/src/common/zfs/zpool_prop.c
        usr/src/uts/common/sys/fs/zfs.h
NEX-4003 WRC: System panics on debug build
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-3558 KRRP Integration
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint - courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/vdev_raidz.c
          +++ new/usr/src/uts/common/fs/zfs/vdev_raidz.c
↓ open down ↓ 16 lines elided ↑ open up ↑
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25   25   * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  26   26   * Copyright (c) 2014 Integros [integros.com]
       27 + * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  27   28   */
  28   29  
  29   30  #include <sys/zfs_context.h>
  30   31  #include <sys/spa.h>
       32 +#include <sys/spa_impl.h>
  31   33  #include <sys/vdev_impl.h>
  32   34  #include <sys/vdev_disk.h>
  33   35  #include <sys/vdev_file.h>
  34   36  #include <sys/vdev_raidz.h>
  35   37  #include <sys/zio.h>
  36   38  #include <sys/zio_checksum.h>
  37   39  #include <sys/abd.h>
  38   40  #include <sys/fs/zfs.h>
  39   41  #include <sys/fm/fs/zfs.h>
       42 +#include <sys/dkioc_free_util.h>
  40   43  
  41   44  /*
  42   45   * Virtual device vector for RAID-Z.
  43   46   *
  44   47   * This vdev supports single, double, and triple parity. For single parity,
  45   48   * we use a simple XOR of all the data columns. For double or triple parity,
  46   49   * we use a special case of Reed-Solomon coding. This extends the
  47   50   * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  48   51   * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  49   52   * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
↓ open down ↓ 110 lines elided ↑ open up ↑
 160  163          VDEV_RAIDZ_64MUL_2((x), mask); \
 161  164  }
 162  165  
 163  166  #define VDEV_LABEL_OFFSET(x)    (x + VDEV_LABEL_START_SIZE)
 164  167  
 165  168  /*
 166  169   * Force reconstruction to use the general purpose method.
 167  170   */
 168  171  int vdev_raidz_default_to_general;
 169  172  
 170      -/* Powers of 2 in the Galois field defined above. */
      173 +/*
      174 + * xor_p hook for external acceleration libraries.
      175 + */
      176 +int (*zfs_xorp_hook)(int vects, int len, void **array) = NULL;
      177 +
      178 +/*
      179 + * These two tables represent powers and logs of 2 in the Galois field defined
      180 + * above. These values were computed by repeatedly multiplying by 2 as above.
      181 + */
 171  182  static const uint8_t vdev_raidz_pow2[256] = {
 172  183          0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 173  184          0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 174  185          0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 175  186          0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 176  187          0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 177  188          0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 178  189          0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 179  190          0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 180  191          0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
↓ open down ↓ 51 lines elided ↑ open up ↑
 232  243          0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 233  244          0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 234  245          0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 235  246          0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 236  247          0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 237  248          0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 238  249          0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 239  250  };
 240  251  
 241  252  static void vdev_raidz_generate_parity(raidz_map_t *rm);
      253 +static void vdev_raidz_trim_done(zio_t *zio);
 242  254  
 243  255  /*
 244  256   * Multiply a given number by 2 raised to the given power.
 245  257   */
 246  258  static uint8_t
 247  259  vdev_raidz_exp2(uint_t a, int exp)
 248  260  {
 249  261          if (a == 0)
 250  262                  return (0);
 251  263  
↓ open down ↓ 7 lines elided ↑ open up ↑
 259  271          return (vdev_raidz_pow2[exp]);
 260  272  }
 261  273  
 262  274  static void
 263  275  vdev_raidz_map_free(raidz_map_t *rm)
 264  276  {
 265  277          int c;
 266  278          size_t size;
 267  279  
 268  280          for (c = 0; c < rm->rm_firstdatacol; c++) {
 269      -                abd_free(rm->rm_col[c].rc_abd);
      281 +                /*
      282 +                 * TRIM doesn't allocate data blocks,
      283 +                 * so 'rc_abd' is NULL in this case.
      284 +                 * See vdev_raidz_trim() and vdev_raidz_map_alloc()
      285 +                 * for more details.
      286 +                 */
      287 +                if (rm->rm_col[c].rc_abd != NULL)
      288 +                        abd_free(rm->rm_col[c].rc_abd);
 270  289  
 271  290                  if (rm->rm_col[c].rc_gdata != NULL)
 272  291                          zio_buf_free(rm->rm_col[c].rc_gdata,
 273  292                              rm->rm_col[c].rc_size);
 274  293          }
 275  294  
 276  295          size = 0;
 277  296          for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 278      -                abd_put(rm->rm_col[c].rc_abd);
      297 +                /*
      298 +                 * TRIM doesn't allocate data blocks,
      299 +                 * so 'rc_abd' is NULL in this case
      300 +                 * See vdev_raidz_trim() and vdev_raidz_map_alloc()
      301 +                 * for more details.
      302 +                 */
      303 +                if (rm->rm_col[c].rc_abd != NULL)
      304 +                        abd_put(rm->rm_col[c].rc_abd);
 279  305                  size += rm->rm_col[c].rc_size;
 280  306          }
 281  307  
 282  308          if (rm->rm_abd_copy != NULL)
 283  309                  abd_free(rm->rm_abd_copy);
 284  310  
 285  311          kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 286  312  }
 287  313  
 288  314  static void
↓ open down ↓ 160 lines elided ↑ open up ↑
 449  475          }
 450  476          ASSERT3U(offset, ==, size);
 451  477  }
 452  478  
 453  479  static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 454  480          vdev_raidz_map_free_vsd,
 455  481          vdev_raidz_cksum_report
 456  482  };
 457  483  
 458  484  /*
 459      - * Divides the IO evenly across all child vdevs; usually, dcols is
 460      - * the number of children in the target vdev.
      485 + * Allocates and computes a raidz column map, which directs the raidz column
      486 + * handling algorithms where to locate and store data and parity columns for
      487 + * a particular DVA. Usually, dcols is the number of children in the target
      488 + * vdev.
      489 + *
      490 + * The `io_offset', `io_size' and `io_data' hold the offset, size and data
      491 + * of the zio for which this map is to be computed.
      492 + * The `unit_shift' parameter contains the minimum allocation bitshift of
      493 + * the storage pool. The `dcols' parameter contains the number of drives in
 * this raidz vdev (including parity drives), with `nparity' denoting how
 * many of those contain parity (one, two or three).
      496 + *
      497 + * The `alloc_io_bufs' flag denotes whether you want the constructed raidz
      498 + * map to contain allocated buffers to hold column IO data or not (if
      499 + * you're using this function simply to determine raidz geometry, you'll
      500 + * want to pass B_FALSE here).
 461  501   */
 462  502  static raidz_map_t *
 463  503  vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
 464      -    uint64_t unit_shift, uint64_t dcols, uint64_t nparity)
      504 +    uint64_t unit_shift, uint64_t dcols, uint64_t nparity,
      505 +    boolean_t alloc_data)
 465  506  {
 466  507          raidz_map_t *rm;
 467  508          /* The starting RAIDZ (parent) vdev sector of the block. */
 468  509          uint64_t b = offset >> unit_shift;
 469  510          /* The zio's size in units of the vdev's minimum sector size. */
 470  511          uint64_t s = size >> unit_shift;
 471  512          /* The first column for this stripe. */
 472  513          uint64_t f = b % dcols;
 473  514          /* The starting byte offset on each child vdev. */
 474  515          uint64_t o = (b / dcols) << unit_shift;
↓ open down ↓ 74 lines elided ↑ open up ↑
 549  590  
 550  591                  asize += rm->rm_col[c].rc_size;
 551  592          }
 552  593  
 553  594          ASSERT3U(asize, ==, tot << unit_shift);
 554  595          rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 555  596          rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 556  597          ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 557  598          ASSERT3U(rm->rm_nskip, <=, nparity);
 558  599  
 559      -        for (c = 0; c < rm->rm_firstdatacol; c++)
 560      -                rm->rm_col[c].rc_abd =
 561      -                    abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
      600 +        if (alloc_data) {
      601 +                for (c = 0; c < rm->rm_firstdatacol; c++) {
      602 +                        rm->rm_col[c].rc_abd =
      603 +                            abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
      604 +                }
 562  605  
 563      -        rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
 564      -        off = rm->rm_col[c].rc_size;
      606 +                rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
      607 +                off = rm->rm_col[c].rc_size;
 565  608  
 566      -        for (c = c + 1; c < acols; c++) {
 567      -                rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
 568      -                off += rm->rm_col[c].rc_size;
      609 +                for (c = c + 1; c < acols; c++) {
      610 +                        rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
      611 +                        off += rm->rm_col[c].rc_size;
      612 +                }
 569  613          }
 570  614  
 571  615          /*
 572  616           * If all data stored spans all columns, there's a danger that parity
 573  617           * will always be on the same device and, since parity isn't read
 574  618           * during normal operation, that that device's I/O bandwidth won't be
 575  619           * used effectively. We therefore switch the parity every 1MB.
 576  620           *
 577  621           * ... at least that was, ostensibly, the theory. As a practical
 578  622           * matter unless we juggle the parity between all devices evenly, we
↓ open down ↓ 81 lines elided ↑ open up ↑
 660  704                  *pqr->p ^= *src;
 661  705                  VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 662  706                  *pqr->q ^= *src;
 663  707                  VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 664  708                  *pqr->r ^= *src;
 665  709          }
 666  710  
 667  711          return (0);
 668  712  }
 669  713  
      714 +/*
      715 + * software acceleration of XOR calculations, requirements
      716 + *
      717 + * the (src/dst) vectors needs to be 64 byte aligned
      718 + * all the vectors have to be the same size
      719 + */
      720 +#define RAIDZ_ACCELERATION_ALIGNMENT    64ul
      721 +#define UNALIGNED(addr) \
      722 +        ((unsigned long)(addr) & (RAIDZ_ACCELERATION_ALIGNMENT-1))
      723 +
 670  724  static void
 671  725  vdev_raidz_generate_parity_p(raidz_map_t *rm)
 672  726  {
 673  727          uint64_t *p;
 674  728          int c;
 675  729          abd_t *src;
 676  730  
      731 +#if 0
      732 +        /* FIXME: needs to be reviewed and changed to support ABD */
      733 +        int parity_done;
      734 +        void *va[16];
      735 +        void **array;
      736 +        int j, nvects;
      737 +
      738 +        parity_done = 0;
      739 +        while (0 && zfs_xorp_hook && !parity_done) {
      740 +                unsigned long no_accel = 0;
      741 +                /* at least two columns (plus one for result) */
      742 +                if (rm->rm_cols < 3) {
      743 +                        DTRACE_PROBE1(raidz_few_cols, int, rm->rm_cols);
      744 +                        break;
      745 +                }
      746 +                /* check sizes and alignment */
      747 +                no_accel = UNALIGNED(rm->rm_col[VDEV_RAIDZ_P].rc_data);
      748 +                if (no_accel) {
      749 +                        DTRACE_PROBE1(raidz_unaligned_dst, unsigned long,
      750 +                            no_accel);
      751 +                        break;
      752 +                }
      753 +                pcount = rm->rm_col[rm->rm_firstdatacol].rc_size;
      754 +                nvects = 1; /* for the destination */
      755 +                for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
      756 +                        no_accel = UNALIGNED(rm->rm_col[c].rc_data);
      757 +                        if (no_accel) {
      758 +                                DTRACE_PROBE1(raidz_unaligned_src,
      759 +                                    unsigned long, no_accel);
      760 +                                break;
      761 +                        }
      762 +                        if (rm->rm_col[c].rc_size != pcount) {
      763 +                                DTRACE_PROBE(raidz_sizes_vary);
      764 +                                no_accel = 1;
      765 +                                break;
      766 +                        }
      767 +                        nvects++;
      768 +                }
      769 +                if (no_accel)
      770 +                        break;
      771 +                if (nvects > 16) {
      772 +                        array = kmem_alloc(nvects * sizeof (void *),
      773 +                            KM_NOSLEEP);
      774 +                        if (array == NULL) {
      775 +                                DTRACE_PROBE(raidz_alloc_failed);
      776 +                                break;
      777 +                        }
      778 +                } else {
      779 +                        array = va;
      780 +                }
      781 +                for (j = 0, c = rm->rm_firstdatacol; c < rm->rm_cols;
      782 +                    c++, j++) {
      783 +                        array[j] = rm->rm_col[c].rc_data;
      784 +                }
      785 +                array[j] = rm->rm_col[VDEV_RAIDZ_P].rc_data;
      786 +                if (zfs_xorp_hook(nvects,
      787 +                    rm->rm_col[rm->rm_firstdatacol].rc_size, array)) {
      788 +                        DTRACE_PROBE(raidz_accel_failure);
      789 +                        break;
      790 +                }
      791 +                if (array != va) {
      792 +                        kmem_free(array, nvects * sizeof (void *));
      793 +                }
      794 +                parity_done = 1;
      795 +                DTRACE_PROBE(raidz_accel_success);
      796 +        }
      797 +        if (parity_done)
      798 +                return;
      799 +#endif
 677  800          for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 678  801                  src = rm->rm_col[c].rc_abd;
 679  802                  p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 680  803  
 681  804                  if (c == rm->rm_firstdatacol) {
 682  805                          abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 683  806                  } else {
 684  807                          struct pqr_struct pqr = { p, NULL, NULL };
 685  808                          (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 686  809                              vdev_raidz_p_func, &pqr);
↓ open down ↓ 1115 lines elided ↑ open up ↑
1802 1925           * contains the requisite offset of the data being read or written.
1803 1926           *
1804 1927           * Even if this I/O operation doesn't span the full block size, let's
1805 1928           * treat the on-disk format as if the only blocks are the complete 128
1806 1929           * KB size.
1807 1930           */
1808 1931          abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1809 1932              SPA_OLD_MAXBLOCKSIZE);
1810 1933          rm = vdev_raidz_map_alloc(abd,
1811 1934              SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1812      -            vd->vdev_children, vd->vdev_nparity);
     1935 +            vd->vdev_children, vd->vdev_nparity, B_TRUE);
1813 1936  
1814 1937          coloffset = origoffset;
1815 1938  
1816 1939          for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1817 1940              c++, coloffset += rc->rc_size) {
1818 1941                  rc = &rm->rm_col[c];
1819 1942                  cvd = vd->vdev_child[rc->rc_devidx];
1820 1943  
1821 1944                  /*
1822 1945                   * Find the start and end of this column in the RAID-Z map,
↓ open down ↓ 44 lines elided ↑ open up ↑
1867 1990          uint64_t cols = vd->vdev_children;
1868 1991          uint64_t nparity = vd->vdev_nparity;
1869 1992  
1870 1993          asize = ((psize - 1) >> ashift) + 1;
1871 1994          asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1872 1995          asize = roundup(asize, nparity + 1) << ashift;
1873 1996  
1874 1997          return (asize);
1875 1998  }
1876 1999  
     2000 +/*
     2001 + * Converts an allocated size on a raidz vdev back to a logical block
     2002 + * size. This is used in trimming to figure out the appropriate logical
     2003 + * size to pass to vdev_raidz_map_alloc when splitting up extents of free
     2004 + * space obtained from metaslabs. However, a range of free space on a
     2005 + * raidz vdev might have originally consisted of multiple blocks and
     2006 + * those, taken together with their skip blocks, might not always align
     2007 + * neatly to a new vdev_raidz_map_alloc covering the entire unified
     2008 + * range. So to ensure that the newly allocated raidz map *always* fits
     2009 + * within the asize passed to this function and never exceeds it (since
     2010 + * that might trim allocated data past it), we round it down to the
     2011 + * nearest suitable multiple of the vdev ashift (hence the "_floor" in
     2012 + * this function's name).
     2013 + * This function is in effect an inverse of vdev_raidz_asize. However,
 * since multiple psizes can map to a single asize (due to variable padding),
     2015 + * this function instead returns the largest chunk that still fits inside
     2016 + * the specified asize).
     2017 + */
     2018 +static uint64_t
     2019 +vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize)
     2020 +{
     2021 +        uint64_t psize;
     2022 +        uint64_t ashift = vd->vdev_top->vdev_ashift;
     2023 +        uint64_t cols = vd->vdev_children;
     2024 +        uint64_t nparity = vd->vdev_nparity;
     2025 +
     2026 +        psize = (asize - (nparity << ashift));
     2027 +        psize /= cols;
     2028 +        psize *= cols - nparity;
     2029 +        psize += (1 << ashift) - 1;
     2030 +
     2031 +        psize = P2ALIGN(psize, 1 << ashift);
     2032 +
     2033 +        return (psize);
     2034 +}
     2035 +
1877 2036  static void
1878 2037  vdev_raidz_child_done(zio_t *zio)
1879 2038  {
1880 2039          raidz_col_t *rc = zio->io_private;
1881 2040  
1882 2041          rc->rc_error = zio->io_error;
1883 2042          rc->rc_tried = 1;
1884 2043          rc->rc_skipped = 0;
1885 2044  }
1886 2045  
↓ open down ↓ 19 lines elided ↑ open up ↑
1906 2065  {
1907 2066          vdev_t *vd = zio->io_vd;
1908 2067          vdev_t *tvd = vd->vdev_top;
1909 2068          vdev_t *cvd;
1910 2069          raidz_map_t *rm;
1911 2070          raidz_col_t *rc;
1912 2071          int c, i;
1913 2072  
1914 2073          rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1915 2074              tvd->vdev_ashift, vd->vdev_children,
1916      -            vd->vdev_nparity);
     2075 +            vd->vdev_nparity, B_TRUE);
1917 2076  
1918 2077          zio->io_vsd = rm;
1919 2078          zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1920 2079  
1921 2080          ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1922 2081  
1923 2082          if (zio->io_type == ZIO_TYPE_WRITE) {
1924 2083                  vdev_raidz_generate_parity(rm);
1925 2084  
1926 2085                  for (c = 0; c < rm->rm_cols; c++) {
↓ open down ↓ 68 lines elided ↑ open up ↑
1995 2154  
1996 2155  
/*
 * Report a checksum error for a child of a RAID-Z device.
 *
 * `rc' identifies the column (and therefore the child vdev) that held the
 * bad data; `bad_data' is the reconstructed-bad buffer handed to the
 * ereport so diagnosis can compare it with the on-disk contents.
 */
static void
raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
{
        void *buf;
        vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
        vdev_stat_t *vs = &vd->vdev_stat;
        spa_t *spa = zio->io_spa;

        /* Speculative I/Os don't bump stats or post ereports. */
        if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
                zio_bad_cksum_t zbc;
                raidz_map_t *rm = zio->io_vsd;

                mutex_enter(&vd->vdev_stat_lock);
                vd->vdev_stat.vs_checksum_errors++;
                mutex_exit(&vd->vdev_stat_lock);

                zbc.zbc_has_cksum = 0;
                zbc.zbc_injected = rm->rm_ecksuminjected;

                /* ereport needs a linear buffer, so borrow a copy */
                buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
                zfs_ereport_post_checksum(zio->io_spa, vd, zio,
                    rc->rc_offset, rc->rc_size, buf, bad_data,
                    &zbc);
                abd_return_buf(rc->rc_abd, buf, rc->rc_size);
        }

        /*
         * Latch the pool-wide "special vdev has errors" flag once any
         * error counter on a special child goes nonzero.
         * NOTE(review): the counters and spa_special_has_errors are read
         * here without vdev_stat_lock -- presumably acceptable for a
         * sticky boolean, but confirm the intended locking.
         */
        if (vd->vdev_isspecial && (vs->vs_checksum_errors ||
            vs->vs_read_errors || vs->vs_write_errors) &&
            !spa->spa_special_has_errors) {
                spa->spa_special_has_errors = B_TRUE;
        }
}
2024 2191  
2025 2192  /*
2026 2193   * We keep track of whether or not there were any injected errors, so that
2027 2194   * any ereports we generate can note it.
2028 2195   */
2029 2196  static int
2030 2197  raidz_checksum_verify(zio_t *zio)
2031 2198  {
2032 2199          zio_bad_cksum_t zbc;
↓ open down ↓ 255 lines elided ↑ open up ↑
2288 2455          raidz_col_t *rc;
2289 2456          int unexpected_errors = 0;
2290 2457          int parity_errors = 0;
2291 2458          int parity_untried = 0;
2292 2459          int data_errors = 0;
2293 2460          int total_errors = 0;
2294 2461          int n, c;
2295 2462          int tgts[VDEV_RAIDZ_MAXPARITY];
2296 2463          int code;
2297 2464  
2298      -        ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2299      -
2300 2465          ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2301 2466          ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2302 2467  
2303 2468          for (c = 0; c < rm->rm_cols; c++) {
2304 2469                  rc = &rm->rm_col[c];
2305 2470  
2306 2471                  if (rc->rc_error) {
2307 2472                          ASSERT(rc->rc_error != ECKSUM); /* child has no bp */
2308 2473  
2309 2474                          if (c < rm->rm_firstdatacol)
↓ open down ↓ 238 lines elided ↑ open up ↑
2548 2713  {
2549 2714          if (faulted > vd->vdev_nparity)
2550 2715                  vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2551 2716                      VDEV_AUX_NO_REPLICAS);
2552 2717          else if (degraded + faulted != 0)
2553 2718                  vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2554 2719          else
2555 2720                  vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2556 2721  }
2557 2722  
     2723 +static inline void
     2724 +vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp,
     2725 +    const raidz_col_t *rc)
     2726 +{
     2727 +        uint64_t num_exts = *num_extsp;
     2728 +        ASSERT(rc->rc_size != 0);
     2729 +
     2730 +        if (dfl->dfl_num_exts > 0 &&
     2731 +            dfl->dfl_exts[num_exts - 1].dfle_start +
     2732 +            dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) {
     2733 +                dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size;
     2734 +        } else {
     2735 +                dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset;
     2736 +                dfl->dfl_exts[num_exts].dfle_length = rc->rc_size;
     2737 +                (*num_extsp)++;
     2738 +        }
     2739 +}
     2740 +
/*
 * Processes a trim for a raidz vdev.
 *
 * The extents in `trim_exts' are expressed in raidz (parent) address
 * space; for each one we compute a raidz map (geometry only, no data
 * buffers) and redistribute the resulting per-column extents onto
 * per-child free lists, which are then issued to the children as
 * DKIOCFREE ioctls under the parent zio `pio'.
 *
 * Ownership: sub-lists handed to zio_ioctl() are freed by
 * vdev_raidz_trim_done(); empty ones are freed here.
 */
static void
vdev_raidz_trim(vdev_t *vd, zio_t *pio, void *trim_exts)
{
        dkioc_free_list_t *dfl = trim_exts;
        dkioc_free_list_t **sub_dfls;
        /* per-child count of extents actually appended to sub_dfls[i] */
        uint64_t *sub_dfls_num_exts;

        sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children,
            KM_SLEEP);
        sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children,
            KM_SLEEP);
        for (int i = 0; i < vd->vdev_children; i++) {
                /*
                 * We might over-allocate here, because the sub-lists can never
                 * be longer than the parent list, but they can be shorter.
                 * The underlying driver will discard zero-length extents.
                 */
                sub_dfls[i] = kmem_zalloc(DFL_SZ(dfl->dfl_num_exts), KM_SLEEP);
                sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts;
                sub_dfls[i]->dfl_flags = dfl->dfl_flags;
                sub_dfls[i]->dfl_offset = dfl->dfl_offset;
                /* don't copy the check func, because it isn't raidz-aware */
        }

        /*
         * Process all extents and redistribute them to the component vdevs
         * according to a computed raidz map geometry.
         */
        for (int i = 0; i < dfl->dfl_num_exts; i++) {
                uint64_t start = dfl->dfl_exts[i].dfle_start;
                uint64_t length = dfl->dfl_exts[i].dfle_length;
                /*
                 * psize_floor rounds the extent down so the computed map
                 * never covers (and thus never trims) bytes past `length'.
                 * alloc_data == B_FALSE: geometry only, no abd buffers.
                 */
                raidz_map_t *rm = vdev_raidz_map_alloc(NULL,
                    vdev_raidz_psize_floor(vd, length), start,
                    vd->vdev_top->vdev_ashift, vd->vdev_children,
                    vd->vdev_nparity, B_FALSE);

                for (uint64_t j = 0; j < rm->rm_cols; j++) {
                        uint64_t devidx = rm->rm_col[j].rc_devidx;
                        vdev_raidz_trim_append_rc(sub_dfls[devidx],
                            &sub_dfls_num_exts[devidx], &rm->rm_col[j]);
                }
                vdev_raidz_map_free(rm);
        }

        /*
         * Issue the component ioctls as children of the parent zio.
         */
        for (int i = 0; i < vd->vdev_children; i++) {
                if (sub_dfls_num_exts[i] != 0) {
                        /* sub_dfls[i] is freed by vdev_raidz_trim_done() */
                        zio_nowait(zio_ioctl(pio, vd->vdev_child[i]->vdev_spa,
                            vd->vdev_child[i], DKIOCFREE,
                            vdev_raidz_trim_done, sub_dfls[i],
                            ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
                            ZIO_FLAG_DONT_RETRY));
                } else {
                        /* no extents landed on this child: free it now */
                        dfl_free(sub_dfls[i]);
                }
        }
        kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children);
        kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children);
}
     2805 +
/*
 * Completion callback for the DKIOCFREE ioctls issued to component
 * devices in vdev_raidz_trim(); releases the per-child
 * dkioc_free_list_t passed as the zio's private data.
 * (The comment previously referenced a nonexistent
 * vdev_raidz_dkioc_free.)
 */
static void
vdev_raidz_trim_done(zio_t *zio)
{
        ASSERT(zio->io_private != NULL);
        dfl_free(zio->io_private);
}
     2816 +
/*
 * RAID-Z vdev operations vector. Entry order is dictated by the
 * vdev_ops_t declaration (see vdev_impl.h); the trim entry is newly
 * wired to vdev_raidz_trim. NOTE(review): the two NULL slots precede
 * the trim callback in this layout -- confirm their meaning against
 * vdev_ops_t before reordering.
 */
vdev_ops_t vdev_raidz_ops = {
        vdev_raidz_open,
        vdev_raidz_close,
        vdev_raidz_asize,
        vdev_raidz_io_start,
        vdev_raidz_io_done,
        vdev_raidz_state_change,
        NULL,
        NULL,
        vdev_raidz_trim,        /* raidz-aware TRIM (was NULL) */
        VDEV_TYPE_RAIDZ,        /* name of this vdev type */
        B_FALSE                 /* not a leaf vdev */
};
    
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX