Print this page
NEX-16191 scrub after trim finds thousands of checksum errors
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Joyce McIntosh <joyce.mcintosh@nexenta.com>
NEX-15749 zpool trim command for a raidz-pool causes panic
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
NEX-14571 remove isal support remnants
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4620 ZFS autotrim triggering is unreliable
NEX-4622 On-demand TRIM code illogically enumerates metaslabs via mg_ms_tree
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Reviewed by: Hans Rosenfeld <hans.rosenfeld@nexenta.com>
NEX-3984 On-demand TRIM
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
Conflicts:
        usr/src/common/zfs/zpool_prop.c
        usr/src/uts/common/sys/fs/zfs.h
NEX-4003 WRC: System panics on debug build
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Reviewed by: Josef 'Jeff' Sipek <josef.sipek@nexenta.com>
NEX-3558 KRRP Integration
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h
re #8279 rb3915 need a mechanism to notify NMS about ZFS config changes (fix lint -courtesy of Yuri Pankov)
re #12584 rb4049 zfsxx latest code merge (fix lint - courtesy of Yuri Pankov)
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code


   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  26  * Copyright (c) 2014 Integros [integros.com]

  27  */
  28 
  29 #include <sys/zfs_context.h>
  30 #include <sys/spa.h>

  31 #include <sys/vdev_impl.h>
  32 #include <sys/vdev_disk.h>
  33 #include <sys/vdev_file.h>
  34 #include <sys/vdev_raidz.h>
  35 #include <sys/zio.h>
  36 #include <sys/zio_checksum.h>
  37 #include <sys/abd.h>
  38 #include <sys/fs/zfs.h>
  39 #include <sys/fm/fs/zfs.h>

  40 
  41 /*
  42  * Virtual device vector for RAID-Z.
  43  *
  44  * This vdev supports single, double, and triple parity. For single parity,
  45  * we use a simple XOR of all the data columns. For double or triple parity,
  46  * we use a special case of Reed-Solomon coding. This extends the
  47  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  48  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  49  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  50  * former is also based. The latter is designed to provide higher performance
  51  * for writes.
  52  *
  53  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  54  * amended six years later identifying a critical flaw that invalidates its
  55  * claims. Nevertheless, the technique can be adapted to work for up to
  56  * triple parity. For additional parity, the amendment "Note: Correction to
  57  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  58  * is viable, but the additional complexity means that write performance will
  59  * suffer.


 150 { \
 151         (mask) = (x) & 0x8080808080808080ULL; \
 152         (mask) = ((mask) << 1) - ((mask) >> 7); \
 153         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 154             ((mask) & 0x1d1d1d1d1d1d1d1d); \
 155 }
 156 
 157 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 158 { \
 159         VDEV_RAIDZ_64MUL_2((x), mask); \
 160         VDEV_RAIDZ_64MUL_2((x), mask); \
 161 }
 162 
 163 #define VDEV_LABEL_OFFSET(x)    (x + VDEV_LABEL_START_SIZE)
 164 
 165 /*
 166  * Force reconstruction to use the general purpose method.
 167  */
 168 int vdev_raidz_default_to_general;
 169 
 170 /* Powers of 2 in the Galois field defined above. */








 171 static const uint8_t vdev_raidz_pow2[256] = {
 172         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 173         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 174         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 175         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 176         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 177         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 178         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 179         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 180         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 181         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 182         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 183         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 184         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 185         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 186         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 187         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 188         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 189         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 190         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,


 222         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 223         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 224         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 225         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 226         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 227         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 228         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 229         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 230         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 231         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 232         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 233         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 234         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 235         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 236         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 237         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 238         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 239 };
 240 
 241 static void vdev_raidz_generate_parity(raidz_map_t *rm);

 242 
 243 /*
 244  * Multiply a given number by 2 raised to the given power.
 245  */
 246 static uint8_t
 247 vdev_raidz_exp2(uint_t a, int exp)
 248 {
 249         if (a == 0)
 250                 return (0);
 251 
 252         ASSERT(exp >= 0);
 253         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 254 
 255         exp += vdev_raidz_log2[a];
 256         if (exp > 255)
 257                 exp -= 255;
 258 
 259         return (vdev_raidz_pow2[exp]);
 260 }
 261 
 262 static void
 263 vdev_raidz_map_free(raidz_map_t *rm)
 264 {
 265         int c;
 266         size_t size;
 267 
 268         for (c = 0; c < rm->rm_firstdatacol; c++) {







 269                 abd_free(rm->rm_col[c].rc_abd);
 270 
 271                 if (rm->rm_col[c].rc_gdata != NULL)
 272                         zio_buf_free(rm->rm_col[c].rc_gdata,
 273                             rm->rm_col[c].rc_size);
 274         }
 275 
 276         size = 0;
 277         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {







 278                 abd_put(rm->rm_col[c].rc_abd);
 279                 size += rm->rm_col[c].rc_size;
 280         }
 281 
 282         if (rm->rm_abd_copy != NULL)
 283                 abd_free(rm->rm_abd_copy);
 284 
 285         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 286 }
 287 
 288 static void
 289 vdev_raidz_map_free_vsd(zio_t *zio)
 290 {
 291         raidz_map_t *rm = zio->io_vsd;
 292 
 293         ASSERT0(rm->rm_freed);
 294         rm->rm_freed = 1;
 295 
 296         if (rm->rm_reports == 0)
 297                 vdev_raidz_map_free(rm);


 439 
 440         for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 441                 raidz_col_t *col = &rm->rm_col[c];
 442                 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
 443 
 444                 abd_copy(tmp, col->rc_abd, col->rc_size);
 445                 abd_put(col->rc_abd);
 446                 col->rc_abd = tmp;
 447 
 448                 offset += col->rc_size;
 449         }
 450         ASSERT3U(offset, ==, size);
 451 }
 452 
/*
 * vdev-specific-data callbacks attached to raidz zios: the first frees the
 * raidz map on zio completion (deferred while checksum ereports are still
 * outstanding — see vdev_raidz_map_free_vsd()); the second is the checksum
 * reporting hook (vdev_raidz_cksum_report, defined elsewhere in this file).
 */
 453 static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
 454         vdev_raidz_map_free_vsd,
 455         vdev_raidz_cksum_report
 456 };
 457 
 458 /*
 459  * Divides the IO evenly across all child vdevs; usually, dcols is
 460  * the number of children in the target vdev.














 461  */
 462 static raidz_map_t *
 463 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
 464     uint64_t unit_shift, uint64_t dcols, uint64_t nparity)

 465 {
 466         raidz_map_t *rm;
 467         /* The starting RAIDZ (parent) vdev sector of the block. */
 468         uint64_t b = offset >> unit_shift;
 469         /* The zio's size in units of the vdev's minimum sector size. */
 470         uint64_t s = size >> unit_shift;
 471         /* The first column for this stripe. */
 472         uint64_t f = b % dcols;
 473         /* The starting byte offset on each child vdev. */
 474         uint64_t o = (b / dcols) << unit_shift;
 475         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 476         uint64_t off = 0;
 477 
 478         /*
 479          * "Quotient": The number of data sectors for this stripe on all but
 480          * the "big column" child vdevs that also contain "remainder" data.
 481          */
 482         q = s / (dcols - nparity);
 483 
 484         /*


 539                 rm->rm_col[c].rc_error = 0;
 540                 rm->rm_col[c].rc_tried = 0;
 541                 rm->rm_col[c].rc_skipped = 0;
 542 
 543                 if (c >= acols)
 544                         rm->rm_col[c].rc_size = 0;
 545                 else if (c < bc)
 546                         rm->rm_col[c].rc_size = (q + 1) << unit_shift;
 547                 else
 548                         rm->rm_col[c].rc_size = q << unit_shift;
 549 
 550                 asize += rm->rm_col[c].rc_size;
 551         }
 552 
 553         ASSERT3U(asize, ==, tot << unit_shift);
 554         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 555         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 556         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 557         ASSERT3U(rm->rm_nskip, <=, nparity);
 558 
 559         for (c = 0; c < rm->rm_firstdatacol; c++)

 560                 rm->rm_col[c].rc_abd =
 561                     abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);

 562 
 563         rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
 564         off = rm->rm_col[c].rc_size;
 565 
 566         for (c = c + 1; c < acols; c++) {
 567                 rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
 568                 off += rm->rm_col[c].rc_size;
 569         }

 570 
 571         /*
 572          * If all data stored spans all columns, there's a danger that parity
 573          * will always be on the same device and, since parity isn't read
 574          * during normal operation, that that device's I/O bandwidth won't be
 575          * used effectively. We therefore switch the parity every 1MB.
 576          *
 577          * ... at least that was, ostensibly, the theory. As a practical
 578          * matter unless we juggle the parity between all devices evenly, we
 579          * won't see any benefit. Further, occasional writes that aren't a
 580          * multiple of the LCM of the number of children and the minimum
 581          * stripe width are sufficient to avoid pessimal behavior.
 582          * Unfortunately, this decision created an implicit on-disk format
 583          * requirement that we need to support for all eternity, but only
 584          * for single-parity RAID-Z.
 585          *
 586          * If we intend to skip a sector in the zeroth column for padding
 587          * we must make sure to note this swap. We will never intend to
 588          * skip the first column since at least one data and one parity
 589          * column must appear in each row.


 650 vdev_raidz_pqr_func(void *buf, size_t size, void *private)
 651 {
 652         struct pqr_struct *pqr = private;
 653         const uint64_t *src = buf;
 654         uint64_t mask;
 655         int i, cnt = size / sizeof (src[0]);
 656 
 657         ASSERT(pqr->p && pqr->q && pqr->r);
 658 
 659         for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
 660                 *pqr->p ^= *src;
 661                 VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
 662                 *pqr->q ^= *src;
 663                 VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
 664                 *pqr->r ^= *src;
 665         }
 666 
 667         return (0);
 668 }
 669 










 670 static void
 671 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 672 {
 673         uint64_t *p;
 674         int c;
 675         abd_t *src;
 676 
























 677         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {













































 678                 src = rm->rm_col[c].rc_abd;
 679                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 680 
 681                 if (c == rm->rm_firstdatacol) {
 682                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 683                 } else {
 684                         struct pqr_struct pqr = { p, NULL, NULL };
 685                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 686                             vdev_raidz_p_func, &pqr);
 687                 }
 688         }
 689 }
 690 
 691 static void
 692 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 693 {
 694         uint64_t *p, *q, pcnt, ccnt, mask, i;
 695         int c;
 696         abd_t *src;
 697 


1792          * Don't write past the end of the block
1793          */
1794         VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1795 
1796         start = offset;
1797         end = start + size;
1798 
1799         /*
1800          * Allocate a RAID-Z map for this block.  Note that this block starts
1801          * from the "original" offset, this is, the offset of the extent which
1802          * contains the requisite offset of the data being read or written.
1803          *
1804          * Even if this I/O operation doesn't span the full block size, let's
1805          * treat the on-disk format as if the only blocks are the complete 128
1806          * KB size.
1807          */
1808         abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1809             SPA_OLD_MAXBLOCKSIZE);
1810         rm = vdev_raidz_map_alloc(abd,
1811             SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1812             vd->vdev_children, vd->vdev_nparity);
1813 
1814         coloffset = origoffset;
1815 
1816         for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1817             c++, coloffset += rc->rc_size) {
1818                 rc = &rm->rm_col[c];
1819                 cvd = vd->vdev_child[rc->rc_devidx];
1820 
1821                 /*
1822                  * Find the start and end of this column in the RAID-Z map,
1823                  * keeping in mind that the stated size and offset of the
1824                  * operation may not fill the entire column for this vdev.
1825                  *
1826                  * If any portion of the data spans this column, issue the
1827                  * appropriate operation to the vdev.
1828                  */
1829                 if (coloffset + rc->rc_size <= start)
1830                         continue;
1831                 if (coloffset >= end)
1832                         continue;


1857 #endif  /* KERNEL */
1858 
1859         return (err);
1860 }
1861 
1862 static uint64_t
1863 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1864 {
1865         uint64_t asize;
1866         uint64_t ashift = vd->vdev_top->vdev_ashift;
1867         uint64_t cols = vd->vdev_children;
1868         uint64_t nparity = vd->vdev_nparity;
1869 
1870         asize = ((psize - 1) >> ashift) + 1;
1871         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1872         asize = roundup(asize, nparity + 1) << ashift;
1873 
1874         return (asize);
1875 }
1876 




































1877 static void
1878 vdev_raidz_child_done(zio_t *zio)
1879 {
1880         raidz_col_t *rc = zio->io_private;
1881 
1882         rc->rc_error = zio->io_error;
1883         rc->rc_tried = 1;
1884         rc->rc_skipped = 0;
1885 }
1886 
1887 /*
1888  * Start an IO operation on a RAIDZ VDev
1889  *
1890  * Outline:
1891  * - For write operations:
1892  *   1. Generate the parity data
1893  *   2. Create child zio write operations to each column's vdev, for both
1894  *      data and parity.
1895  *   3. If the column skips any sectors for padding, create optional dummy
1896  *      write zio children for those areas to improve aggregation continuity.
1897  * - For read operations:
1898  *   1. Create child zio read operations to each data column's vdev to read
1899  *      the range of data required for zio.
1900  *   2. If this is a scrub or resilver operation, or if any of the data
1901  *      vdevs have had errors, then create zio read operations to the parity
1902  *      columns' VDevs as well.
1903  */
1904 static void
1905 vdev_raidz_io_start(zio_t *zio)
1906 {
1907         vdev_t *vd = zio->io_vd;
1908         vdev_t *tvd = vd->vdev_top;
1909         vdev_t *cvd;
1910         raidz_map_t *rm;
1911         raidz_col_t *rc;
1912         int c, i;
1913 
1914         rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
1915             tvd->vdev_ashift, vd->vdev_children,
1916             vd->vdev_nparity);
1917 
1918         zio->io_vsd = rm;
1919         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
1920 
1921         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
1922 
1923         if (zio->io_type == ZIO_TYPE_WRITE) {
1924                 vdev_raidz_generate_parity(rm);
1925 
1926                 for (c = 0; c < rm->rm_cols; c++) {
1927                         rc = &rm->rm_col[c];
1928                         cvd = vd->vdev_child[rc->rc_devidx];
1929                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1930                             rc->rc_offset, rc->rc_abd, rc->rc_size,
1931                             zio->io_type, zio->io_priority, 0,
1932                             vdev_raidz_child_done, rc));
1933                 }
1934 
1935                 /*
1936                  * Generate optional I/Os for any skipped sectors to improve


1985                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
1986                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
1987                             rc->rc_offset, rc->rc_abd, rc->rc_size,
1988                             zio->io_type, zio->io_priority, 0,
1989                             vdev_raidz_child_done, rc));
1990                 }
1991         }
1992 
1993         zio_execute(zio);
1994 }
1995 
1996 
1997 /*
1998  * Report a checksum error for a child of a RAID-Z device.
1999  */
2000 static void
2001 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2002 {
2003         void *buf;
2004         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];


2005 
                /*
                 * Speculative I/Os don't bump error counters or post
                 * ereports; only non-speculative reads are accounted.
                 */
2006         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2007                 zio_bad_cksum_t zbc;
2008                 raidz_map_t *rm = zio->io_vsd;
2009 
                        /* vdev stats are shared; update under the stat lock. */
2010                 mutex_enter(&vd->vdev_stat_lock);
2011                 vd->vdev_stat.vs_checksum_errors++;
2012                 mutex_exit(&vd->vdev_stat_lock);
2013 
2014                 zbc.zbc_has_cksum = 0;
2015                 zbc.zbc_injected = rm->rm_ecksuminjected;
2016 
                        /*
                         * Borrow a linear copy of the column data for the
                         * ereport.  It is only read, so abd_return_buf()
                         * (no copy-back) is the matching release.
                         */
2017                 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2018                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2019                     rc->rc_offset, rc->rc_size, buf, bad_data,
2020                     &zbc);
2021                 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2022         }





2023 }
2024 
2025 /*
2026  * We keep track of whether or not there were any injected errors, so that
2027  * any ereports we generate can note it.
2028  */
2029 static int
2030 raidz_checksum_verify(zio_t *zio)
2031 {
2032         zio_bad_cksum_t zbc;
2033         raidz_map_t *rm = zio->io_vsd;
2034 
2035         int ret = zio_checksum_error(zio, &zbc);
2036         if (ret != 0 && zbc.zbc_injected != 0)
2037                 rm->rm_ecksuminjected = 1;
2038 
2039         return (ret);
2040 }
2041 
2042 /*


2278  *      d. If that doesn't work, return an error.
2279  *   3. If there were unexpected errors or this is a resilver operation,
2280  *      rewrite the vdevs that had errors.
2281  */
2282 static void
2283 vdev_raidz_io_done(zio_t *zio)
2284 {
2285         vdev_t *vd = zio->io_vd;
2286         vdev_t *cvd;
2287         raidz_map_t *rm = zio->io_vsd;
2288         raidz_col_t *rc;
2289         int unexpected_errors = 0;
2290         int parity_errors = 0;
2291         int parity_untried = 0;
2292         int data_errors = 0;
2293         int total_errors = 0;
2294         int n, c;
2295         int tgts[VDEV_RAIDZ_MAXPARITY];
2296         int code;
2297 
2298         ASSERT(zio->io_bp != NULL);  /* XXX need to add code to enforce this */
2299 
2300         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2301         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2302 
2303         for (c = 0; c < rm->rm_cols; c++) {
2304                 rc = &rm->rm_col[c];
2305 
2306                 if (rc->rc_error) {
2307                         ASSERT(rc->rc_error != ECKSUM);      /* child has no bp */
2308 
2309                         if (c < rm->rm_firstdatacol)
2310                                 parity_errors++;
2311                         else
2312                                 data_errors++;
2313 
2314                         if (!rc->rc_skipped)
2315                                 unexpected_errors++;
2316 
2317                         total_errors++;
2318                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2319                         parity_untried++;


2538                             rc->rc_offset, rc->rc_abd, rc->rc_size,
2539                             ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2540                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2541                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2542                 }
2543         }
2544 }
2545 
2546 static void
2547 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2548 {
2549         if (faulted > vd->vdev_nparity)
2550                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2551                     VDEV_AUX_NO_REPLICAS);
2552         else if (degraded + faulted != 0)
2553                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2554         else
2555                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2556 }
2557 






























































































2558 vdev_ops_t vdev_raidz_ops = {
2559         vdev_raidz_open,
2560         vdev_raidz_close,
2561         vdev_raidz_asize,
2562         vdev_raidz_io_start,
2563         vdev_raidz_io_done,
2564         vdev_raidz_state_change,
2565         NULL,
2566         NULL,
2567         NULL,
2568         VDEV_TYPE_RAIDZ,        /* name of this vdev type */
2569         B_FALSE                 /* not a leaf vdev */
2570 };


   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  25  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  26  * Copyright (c) 2014 Integros [integros.com]
  27  * Copyright 2016 Nexenta Systems, Inc.  All rights reserved.
  28  */
  29 
  30 #include <sys/zfs_context.h>
  31 #include <sys/spa.h>
  32 #include <sys/spa_impl.h>
  33 #include <sys/vdev_impl.h>
  34 #include <sys/vdev_disk.h>
  35 #include <sys/vdev_file.h>
  36 #include <sys/vdev_raidz.h>
  37 #include <sys/zio.h>
  38 #include <sys/zio_checksum.h>
  39 #include <sys/abd.h>
  40 #include <sys/fs/zfs.h>
  41 #include <sys/fm/fs/zfs.h>
  42 #include <sys/dkioc_free_util.h>
  43 
  44 /*
  45  * Virtual device vector for RAID-Z.
  46  *
  47  * This vdev supports single, double, and triple parity. For single parity,
  48  * we use a simple XOR of all the data columns. For double or triple parity,
  49  * we use a special case of Reed-Solomon coding. This extends the
  50  * technique described in "The mathematics of RAID-6" by H. Peter Anvin by
  51  * drawing on the system described in "A Tutorial on Reed-Solomon Coding for
  52  * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the
  53  * former is also based. The latter is designed to provide higher performance
  54  * for writes.
  55  *
  56  * Note that the Plank paper claimed to support arbitrary N+M, but was then
  57  * amended six years later identifying a critical flaw that invalidates its
  58  * claims. Nevertheless, the technique can be adapted to work for up to
  59  * triple parity. For additional parity, the amendment "Note: Correction to
  60  * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding
  61  * is viable, but the additional complexity means that write performance will
  62  * suffer.


 153 { \
 154         (mask) = (x) & 0x8080808080808080ULL; \
 155         (mask) = ((mask) << 1) - ((mask) >> 7); \
 156         (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
 157             ((mask) & 0x1d1d1d1d1d1d1d1d); \
 158 }
 159 
 160 #define VDEV_RAIDZ_64MUL_4(x, mask) \
 161 { \
 162         VDEV_RAIDZ_64MUL_2((x), mask); \
 163         VDEV_RAIDZ_64MUL_2((x), mask); \
 164 }
 165 
 166 #define VDEV_LABEL_OFFSET(x)    (x + VDEV_LABEL_START_SIZE)
 167 
 168 /*
 169  * Force reconstruction to use the general purpose method.
 170  */
 171 int vdev_raidz_default_to_general;
 172 
 173 /*
 174  * xor_p hook for external acceleration libraries.
 175  */
 176 int (*zfs_xorp_hook)(int vects, int len, void **array) = NULL;
 177 
 178 /*
 179  * These two tables represent powers and logs of 2 in the Galois field defined
 180  * above. These values were computed by repeatedly multiplying by 2 as above.
 181  */
 182 static const uint8_t vdev_raidz_pow2[256] = {
 183         0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
 184         0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
 185         0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
 186         0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
 187         0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
 188         0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
 189         0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
 190         0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
 191         0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
 192         0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
 193         0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
 194         0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
 195         0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
 196         0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
 197         0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
 198         0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
 199         0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
 200         0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
 201         0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,


 233         0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
 234         0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
 235         0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
 236         0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
 237         0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
 238         0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
 239         0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
 240         0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
 241         0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
 242         0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
 243         0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
 244         0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
 245         0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
 246         0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
 247         0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
 248         0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
 249         0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
 250 };
 251 
 252 static void vdev_raidz_generate_parity(raidz_map_t *rm);
 253 static void vdev_raidz_trim_done(zio_t *zio);
 254 
 255 /*
 256  * Multiply a given number by 2 raised to the given power.
 257  */
 258 static uint8_t
 259 vdev_raidz_exp2(uint_t a, int exp)
 260 {
                /* Zero has no logarithm; a * 2^exp is simply zero. */
 261         if (a == 0)
 262                 return (0);
 263 
 264         ASSERT(exp >= 0);
 265         ASSERT(vdev_raidz_log2[a] > 0 || a == 1);
 266 
                /*
                 * a * 2^exp == pow2[(log2[a] + exp) mod 255] in this Galois
                 * field; the single conditional subtraction performs the
                 * mod-255 reduction for the index ranges used here.
                 */
 267         exp += vdev_raidz_log2[a];
 268         if (exp > 255)
 269                 exp -= 255;
 270 
 271         return (vdev_raidz_pow2[exp]);
 272 }
 273 
/*
 * Free a raidz map: the parity-column buffers (owned by the map), the
 * references to the data columns (views into the zio's abd, hence
 * abd_put), the optional pristine-data copy, and the map itself.
 */
 274 static void
 275 vdev_raidz_map_free(raidz_map_t *rm)
 276 {
 277         int c;
 278         size_t size;
 279 
 280         for (c = 0; c < rm->rm_firstdatacol; c++) {
 281                 /*
 282                  * TRIM doesn't allocate data blocks,
 283                  * so 'rc_abd' is NULL in this case.
 284                  * See vdev_raidz_trim() and vdev_raidz_map_alloc()
 285                  * for more details.
 286                  */
 287                 if (rm->rm_col[c].rc_abd != NULL)
 288                         abd_free(rm->rm_col[c].rc_abd);
 289 
 290                 if (rm->rm_col[c].rc_gdata != NULL)
 291                         zio_buf_free(rm->rm_col[c].rc_gdata,
 292                             rm->rm_col[c].rc_size);
 293         }
 294 
 295         size = 0;
 296         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 297                 /*
 298                  * TRIM doesn't allocate data blocks,
 299                  * so 'rc_abd' is NULL in this case
 300                  * See vdev_raidz_trim() and vdev_raidz_map_alloc()
 301                  * for more details.
 302                  */
 303                 if (rm->rm_col[c].rc_abd != NULL)
 304                         abd_put(rm->rm_col[c].rc_abd);
 305                 size += rm->rm_col[c].rc_size;
 306         }
 307 
 308         if (rm->rm_abd_copy != NULL)
 309                 abd_free(rm->rm_abd_copy);
 310 
 311         kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols]));
 312 }
 313 
 314 static void
 315 vdev_raidz_map_free_vsd(zio_t *zio)
 316 {
 317         raidz_map_t *rm = zio->io_vsd;
 318 
 319         ASSERT0(rm->rm_freed);
 320         rm->rm_freed = 1;
 321 
 322         if (rm->rm_reports == 0)
 323                 vdev_raidz_map_free(rm);


 465 
 466         for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 467                 raidz_col_t *col = &rm->rm_col[c];
 468                 abd_t *tmp = abd_get_offset(rm->rm_abd_copy, offset);
 469 
 470                 abd_copy(tmp, col->rc_abd, col->rc_size);
 471                 abd_put(col->rc_abd);
 472                 col->rc_abd = tmp;
 473 
 474                 offset += col->rc_size;
 475         }
 476         ASSERT3U(offset, ==, size);
 477 }
 478 
/*
 * vdev-specific-data ops attached to a zio carrying a raidz map
 * (see vdev_raidz_io_start, which sets zio->io_vsd_ops to this).
 */
static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
	vdev_raidz_map_free_vsd,	/* frees the map when the zio is done */
	vdev_raidz_cksum_report		/* presumably builds cksum ereports; defined elsewhere in this file — confirm */
};
 483 
 484 /*
 485  * Allocates and computes a raidz column map, which directs the raidz column
 486  * handling algorithms where to locate and store data and parity columns for
 487  * a particular DVA. Usually, dcols is the number of children in the target
 488  * vdev.
 489  *
 * The `offset', `size' and `abd' parameters hold the offset, size and data
 * of the zio for which this map is to be computed.
 * The `unit_shift' parameter contains the minimum allocation bitshift of
 * the storage pool. The `dcols' parameter contains the number of drives in
 * this raidz vdev (including parity drives), with `nparity' denoting how
 * many of those contain the parity (one, two or three).
 *
 * The `alloc_data' flag denotes whether you want the constructed raidz
 * map to contain allocated buffers to hold column IO data or not (if
 * you're using this function simply to determine raidz geometry, you'll
 * want to pass B_FALSE here).
 501  */
 502 static raidz_map_t *
 503 vdev_raidz_map_alloc(abd_t *abd, uint64_t size, uint64_t offset,
 504     uint64_t unit_shift, uint64_t dcols, uint64_t nparity,
 505     boolean_t alloc_data)
 506 {
 507         raidz_map_t *rm;
 508         /* The starting RAIDZ (parent) vdev sector of the block. */
 509         uint64_t b = offset >> unit_shift;
 510         /* The zio's size in units of the vdev's minimum sector size. */
 511         uint64_t s = size >> unit_shift;
 512         /* The first column for this stripe. */
 513         uint64_t f = b % dcols;
 514         /* The starting byte offset on each child vdev. */
 515         uint64_t o = (b / dcols) << unit_shift;
 516         uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
 517         uint64_t off = 0;
 518 
 519         /*
 520          * "Quotient": The number of data sectors for this stripe on all but
 521          * the "big column" child vdevs that also contain "remainder" data.
 522          */
 523         q = s / (dcols - nparity);
 524 
 525         /*


 580                 rm->rm_col[c].rc_error = 0;
 581                 rm->rm_col[c].rc_tried = 0;
 582                 rm->rm_col[c].rc_skipped = 0;
 583 
 584                 if (c >= acols)
 585                         rm->rm_col[c].rc_size = 0;
 586                 else if (c < bc)
 587                         rm->rm_col[c].rc_size = (q + 1) << unit_shift;
 588                 else
 589                         rm->rm_col[c].rc_size = q << unit_shift;
 590 
 591                 asize += rm->rm_col[c].rc_size;
 592         }
 593 
 594         ASSERT3U(asize, ==, tot << unit_shift);
 595         rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift);
 596         rm->rm_nskip = roundup(tot, nparity + 1) - tot;
 597         ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift);
 598         ASSERT3U(rm->rm_nskip, <=, nparity);
 599 
 600         if (alloc_data) {
 601                 for (c = 0; c < rm->rm_firstdatacol; c++) {
 602                         rm->rm_col[c].rc_abd =
 603                             abd_alloc_linear(rm->rm_col[c].rc_size, B_TRUE);
 604                 }
 605 
 606                 rm->rm_col[c].rc_abd = abd_get_offset(abd, 0);
 607                 off = rm->rm_col[c].rc_size;
 608 
 609                 for (c = c + 1; c < acols; c++) {
 610                         rm->rm_col[c].rc_abd = abd_get_offset(abd, off);
 611                         off += rm->rm_col[c].rc_size;
 612                 }
 613         }
 614 
 615         /*
 616          * If all data stored spans all columns, there's a danger that parity
 617          * will always be on the same device and, since parity isn't read
 618          * during normal operation, that that device's I/O bandwidth won't be
 619          * used effectively. We therefore switch the parity every 1MB.
 620          *
 621          * ... at least that was, ostensibly, the theory. As a practical
 622          * matter unless we juggle the parity between all devices evenly, we
 623          * won't see any benefit. Further, occasional writes that aren't a
 624          * multiple of the LCM of the number of children and the minimum
 625          * stripe width are sufficient to avoid pessimal behavior.
 626          * Unfortunately, this decision created an implicit on-disk format
 627          * requirement that we need to support for all eternity, but only
 628          * for single-parity RAID-Z.
 629          *
 630          * If we intend to skip a sector in the zeroth column for padding
 631          * we must make sure to note this swap. We will never intend to
 632          * skip the first column since at least one data and one parity
 633          * column must appear in each row.


vdev_raidz_pqr_func(void *buf, size_t size, void *private)
{
	/*
	 * abd_iterate_func() callback: folds one linear segment of a data
	 * column into the P, Q and R parity accumulators (triple parity).
	 */
	struct pqr_struct *pqr = private;
	const uint64_t *src = buf;
	uint64_t mask;	/* scratch word used by the 64-bit multiply macros */
	int i, cnt = size / sizeof (src[0]);

	/* All three parity streams must be present for a PQR update. */
	ASSERT(pqr->p && pqr->q && pqr->r);

	for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) {
		/* P is plain XOR parity. */
		*pqr->p ^= *src;
		/* Q: multiply the accumulator by 2 in the field, then XOR. */
		VDEV_RAIDZ_64MUL_2(*pqr->q, mask);
		*pqr->q ^= *src;
		/* R: multiply the accumulator by 4 in the field, then XOR. */
		VDEV_RAIDZ_64MUL_4(*pqr->r, mask);
		*pqr->r ^= *src;
	}

	/* Return 0 so abd_iterate_func() continues with the next segment. */
	return (0);
}
 713 
 714 /*
 715  * software acceleration of XOR calculations, requirements
 716  *
 717  * the (src/dst) vectors needs to be 64 byte aligned
 718  * all the vectors have to be the same size
 719  */
 720 #define RAIDZ_ACCELERATION_ALIGNMENT    64ul
 721 #define UNALIGNED(addr) \
 722         ((unsigned long)(addr) & (RAIDZ_ACCELERATION_ALIGNMENT-1))
 723 
 724 static void
 725 vdev_raidz_generate_parity_p(raidz_map_t *rm)
 726 {
 727         uint64_t *p;
 728         int c;
 729         abd_t *src;
 730 
 731 #if 0
 732         /* FIXME: needs to be reviewed and changed to support ABD */
 733         int parity_done;
 734         void *va[16];
 735         void **array;
 736         int j, nvects;
 737 
 738         parity_done = 0;
 739         while (0 && zfs_xorp_hook && !parity_done) {
 740                 unsigned long no_accel = 0;
 741                 /* at least two columns (plus one for result) */
 742                 if (rm->rm_cols < 3) {
 743                         DTRACE_PROBE1(raidz_few_cols, int, rm->rm_cols);
 744                         break;
 745                 }
 746                 /* check sizes and alignment */
 747                 no_accel = UNALIGNED(rm->rm_col[VDEV_RAIDZ_P].rc_data);
 748                 if (no_accel) {
 749                         DTRACE_PROBE1(raidz_unaligned_dst, unsigned long,
 750                             no_accel);
 751                         break;
 752                 }
 753                 pcount = rm->rm_col[rm->rm_firstdatacol].rc_size;
 754                 nvects = 1; /* for the destination */
 755                 for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 756                         no_accel = UNALIGNED(rm->rm_col[c].rc_data);
 757                         if (no_accel) {
 758                                 DTRACE_PROBE1(raidz_unaligned_src,
 759                                     unsigned long, no_accel);
 760                                 break;
 761                         }
 762                         if (rm->rm_col[c].rc_size != pcount) {
 763                                 DTRACE_PROBE(raidz_sizes_vary);
 764                                 no_accel = 1;
 765                                 break;
 766                         }
 767                         nvects++;
 768                 }
 769                 if (no_accel)
 770                         break;
 771                 if (nvects > 16) {
 772                         array = kmem_alloc(nvects * sizeof (void *),
 773                             KM_NOSLEEP);
 774                         if (array == NULL) {
 775                                 DTRACE_PROBE(raidz_alloc_failed);
 776                                 break;
 777                         }
 778                 } else {
 779                         array = va;
 780                 }
 781                 for (j = 0, c = rm->rm_firstdatacol; c < rm->rm_cols;
 782                     c++, j++) {
 783                         array[j] = rm->rm_col[c].rc_data;
 784                 }
 785                 array[j] = rm->rm_col[VDEV_RAIDZ_P].rc_data;
 786                 if (zfs_xorp_hook(nvects,
 787                     rm->rm_col[rm->rm_firstdatacol].rc_size, array)) {
 788                         DTRACE_PROBE(raidz_accel_failure);
 789                         break;
 790                 }
 791                 if (array != va) {
 792                         kmem_free(array, nvects * sizeof (void *));
 793                 }
 794                 parity_done = 1;
 795                 DTRACE_PROBE(raidz_accel_success);
 796         }
 797         if (parity_done)
 798                 return;
 799 #endif
 800         for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) {
 801                 src = rm->rm_col[c].rc_abd;
 802                 p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd);
 803 
 804                 if (c == rm->rm_firstdatacol) {
 805                         abd_copy_to_buf(p, src, rm->rm_col[c].rc_size);
 806                 } else {
 807                         struct pqr_struct pqr = { p, NULL, NULL };
 808                         (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size,
 809                             vdev_raidz_p_func, &pqr);
 810                 }
 811         }
 812 }
 813 
 814 static void
 815 vdev_raidz_generate_parity_pq(raidz_map_t *rm)
 816 {
 817         uint64_t *p, *q, pcnt, ccnt, mask, i;
 818         int c;
 819         abd_t *src;
 820 


1915          * Don't write past the end of the block
1916          */
1917         VERIFY3U(offset + size, <=, origoffset + SPA_OLD_MAXBLOCKSIZE);
1918 
1919         start = offset;
1920         end = start + size;
1921 
1922         /*
1923          * Allocate a RAID-Z map for this block.  Note that this block starts
1924          * from the "original" offset, this is, the offset of the extent which
1925          * contains the requisite offset of the data being read or written.
1926          *
1927          * Even if this I/O operation doesn't span the full block size, let's
1928          * treat the on-disk format as if the only blocks are the complete 128
1929          * KB size.
1930          */
1931         abd_t *abd = abd_get_from_buf(data - (offset - origoffset),
1932             SPA_OLD_MAXBLOCKSIZE);
1933         rm = vdev_raidz_map_alloc(abd,
1934             SPA_OLD_MAXBLOCKSIZE, origoffset, tvd->vdev_ashift,
1935             vd->vdev_children, vd->vdev_nparity, B_TRUE);
1936 
1937         coloffset = origoffset;
1938 
1939         for (c = rm->rm_firstdatacol; c < rm->rm_cols;
1940             c++, coloffset += rc->rc_size) {
1941                 rc = &rm->rm_col[c];
1942                 cvd = vd->vdev_child[rc->rc_devidx];
1943 
1944                 /*
1945                  * Find the start and end of this column in the RAID-Z map,
1946                  * keeping in mind that the stated size and offset of the
1947                  * operation may not fill the entire column for this vdev.
1948                  *
1949                  * If any portion of the data spans this column, issue the
1950                  * appropriate operation to the vdev.
1951                  */
1952                 if (coloffset + rc->rc_size <= start)
1953                         continue;
1954                 if (coloffset >= end)
1955                         continue;


1980 #endif  /* KERNEL */
1981 
1982         return (err);
1983 }
1984 
1985 static uint64_t
1986 vdev_raidz_asize(vdev_t *vd, uint64_t psize)
1987 {
1988         uint64_t asize;
1989         uint64_t ashift = vd->vdev_top->vdev_ashift;
1990         uint64_t cols = vd->vdev_children;
1991         uint64_t nparity = vd->vdev_nparity;
1992 
1993         asize = ((psize - 1) >> ashift) + 1;
1994         asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity));
1995         asize = roundup(asize, nparity + 1) << ashift;
1996 
1997         return (asize);
1998 }
1999 
2000 /*
2001  * Converts an allocated size on a raidz vdev back to a logical block
2002  * size. This is used in trimming to figure out the appropriate logical
2003  * size to pass to vdev_raidz_map_alloc when splitting up extents of free
2004  * space obtained from metaslabs. However, a range of free space on a
2005  * raidz vdev might have originally consisted of multiple blocks and
2006  * those, taken together with their skip blocks, might not always align
2007  * neatly to a new vdev_raidz_map_alloc covering the entire unified
2008  * range. So to ensure that the newly allocated raidz map *always* fits
2009  * within the asize passed to this function and never exceeds it (since
2010  * that might trim allocated data past it), we round it down to the
2011  * nearest suitable multiple of the vdev ashift (hence the "_floor" in
2012  * this function's name).
 * This function is in effect an inverse of vdev_raidz_asize. However,
 * since multiple psizes can map to a single asize (due to variable
 * padding), this function instead returns the largest chunk that still
 * fits inside the specified asize.
2017  */
static uint64_t
vdev_raidz_psize_floor(vdev_t *vd, uint64_t asize)
{
	uint64_t psize;
	uint64_t ashift = vd->vdev_top->vdev_ashift;
	uint64_t cols = vd->vdev_children;
	uint64_t nparity = vd->vdev_nparity;

	/* Strip the minimum parity overhead (nparity sectors). */
	psize = (asize - (nparity << ashift));
	/* Scale the remainder down to the data fraction of each stripe. */
	psize /= cols;
	psize *= cols - nparity;
	/*
	 * NOTE(review): adding (1 << ashift) - 1 before P2ALIGN rounds the
	 * result *up* to the next sector, despite the "_floor" name; the
	 * block comment above promises the map never exceeds asize —
	 * confirm this cannot overshoot for any cols/nparity combination.
	 */
	psize += (1 << ashift) - 1;

	/* Result must be sector-aligned for vdev_raidz_map_alloc(). */
	psize = P2ALIGN(psize, 1 << ashift);

	return (psize);
}
2035 
2036 static void
2037 vdev_raidz_child_done(zio_t *zio)
2038 {
2039         raidz_col_t *rc = zio->io_private;
2040 
2041         rc->rc_error = zio->io_error;
2042         rc->rc_tried = 1;
2043         rc->rc_skipped = 0;
2044 }
2045 
2046 /*
2047  * Start an IO operation on a RAIDZ VDev
2048  *
2049  * Outline:
2050  * - For write operations:
2051  *   1. Generate the parity data
2052  *   2. Create child zio write operations to each column's vdev, for both
2053  *      data and parity.
2054  *   3. If the column skips any sectors for padding, create optional dummy
2055  *      write zio children for those areas to improve aggregation continuity.
2056  * - For read operations:
2057  *   1. Create child zio read operations to each data column's vdev to read
2058  *      the range of data required for zio.
2059  *   2. If this is a scrub or resilver operation, or if any of the data
2060  *      vdevs have had errors, then create zio read operations to the parity
2061  *      columns' VDevs as well.
2062  */
2063 static void
2064 vdev_raidz_io_start(zio_t *zio)
2065 {
2066         vdev_t *vd = zio->io_vd;
2067         vdev_t *tvd = vd->vdev_top;
2068         vdev_t *cvd;
2069         raidz_map_t *rm;
2070         raidz_col_t *rc;
2071         int c, i;
2072 
2073         rm = vdev_raidz_map_alloc(zio->io_abd, zio->io_size, zio->io_offset,
2074             tvd->vdev_ashift, vd->vdev_children,
2075             vd->vdev_nparity, B_TRUE);
2076 
2077         zio->io_vsd = rm;
2078         zio->io_vsd_ops = &vdev_raidz_vsd_ops;
2079 
2080         ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size));
2081 
2082         if (zio->io_type == ZIO_TYPE_WRITE) {
2083                 vdev_raidz_generate_parity(rm);
2084 
2085                 for (c = 0; c < rm->rm_cols; c++) {
2086                         rc = &rm->rm_col[c];
2087                         cvd = vd->vdev_child[rc->rc_devidx];
2088                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2089                             rc->rc_offset, rc->rc_abd, rc->rc_size,
2090                             zio->io_type, zio->io_priority, 0,
2091                             vdev_raidz_child_done, rc));
2092                 }
2093 
2094                 /*
2095                  * Generate optional I/Os for any skipped sectors to improve


2144                     (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) {
2145                         zio_nowait(zio_vdev_child_io(zio, NULL, cvd,
2146                             rc->rc_offset, rc->rc_abd, rc->rc_size,
2147                             zio->io_type, zio->io_priority, 0,
2148                             vdev_raidz_child_done, rc));
2149                 }
2150         }
2151 
2152         zio_execute(zio);
2153 }
2154 
2155 
2156 /*
2157  * Report a checksum error for a child of a RAID-Z device.
2158  */
2159 static void
2160 raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data)
2161 {
2162         void *buf;
2163         vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx];
2164         vdev_stat_t *vs = &vd->vdev_stat;
2165         spa_t *spa = zio->io_spa;
2166 
2167         if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) {
2168                 zio_bad_cksum_t zbc;
2169                 raidz_map_t *rm = zio->io_vsd;
2170 
2171                 mutex_enter(&vd->vdev_stat_lock);
2172                 vd->vdev_stat.vs_checksum_errors++;
2173                 mutex_exit(&vd->vdev_stat_lock);
2174 
2175                 zbc.zbc_has_cksum = 0;
2176                 zbc.zbc_injected = rm->rm_ecksuminjected;
2177 
2178                 buf = abd_borrow_buf_copy(rc->rc_abd, rc->rc_size);
2179                 zfs_ereport_post_checksum(zio->io_spa, vd, zio,
2180                     rc->rc_offset, rc->rc_size, buf, bad_data,
2181                     &zbc);
2182                 abd_return_buf(rc->rc_abd, buf, rc->rc_size);
2183         }
2184 
2185         if (vd->vdev_isspecial && (vs->vs_checksum_errors ||
2186             vs->vs_read_errors || vs->vs_write_errors) &&
2187             !spa->spa_special_has_errors) {
2188                 spa->spa_special_has_errors = B_TRUE;
2189         }
2190 }
2191 
2192 /*
2193  * We keep track of whether or not there were any injected errors, so that
2194  * any ereports we generate can note it.
2195  */
2196 static int
2197 raidz_checksum_verify(zio_t *zio)
2198 {
2199         zio_bad_cksum_t zbc;
2200         raidz_map_t *rm = zio->io_vsd;
2201 
2202         int ret = zio_checksum_error(zio, &zbc);
2203         if (ret != 0 && zbc.zbc_injected != 0)
2204                 rm->rm_ecksuminjected = 1;
2205 
2206         return (ret);
2207 }
2208 
2209 /*


2445  *      d. If that doesn't work, return an error.
2446  *   3. If there were unexpected errors or this is a resilver operation,
2447  *      rewrite the vdevs that had errors.
2448  */
2449 static void
2450 vdev_raidz_io_done(zio_t *zio)
2451 {
2452         vdev_t *vd = zio->io_vd;
2453         vdev_t *cvd;
2454         raidz_map_t *rm = zio->io_vsd;
2455         raidz_col_t *rc;
2456         int unexpected_errors = 0;
2457         int parity_errors = 0;
2458         int parity_untried = 0;
2459         int data_errors = 0;
2460         int total_errors = 0;
2461         int n, c;
2462         int tgts[VDEV_RAIDZ_MAXPARITY];
2463         int code;
2464 


2465         ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol);
2466         ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol);
2467 
2468         for (c = 0; c < rm->rm_cols; c++) {
2469                 rc = &rm->rm_col[c];
2470 
2471                 if (rc->rc_error) {
2472                         ASSERT(rc->rc_error != ECKSUM);      /* child has no bp */
2473 
2474                         if (c < rm->rm_firstdatacol)
2475                                 parity_errors++;
2476                         else
2477                                 data_errors++;
2478 
2479                         if (!rc->rc_skipped)
2480                                 unexpected_errors++;
2481 
2482                         total_errors++;
2483                 } else if (c < rm->rm_firstdatacol && !rc->rc_tried) {
2484                         parity_untried++;


2703                             rc->rc_offset, rc->rc_abd, rc->rc_size,
2704                             ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE,
2705                             ZIO_FLAG_IO_REPAIR | (unexpected_errors ?
2706                             ZIO_FLAG_SELF_HEAL : 0), NULL, NULL));
2707                 }
2708         }
2709 }
2710 
2711 static void
2712 vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded)
2713 {
2714         if (faulted > vd->vdev_nparity)
2715                 vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
2716                     VDEV_AUX_NO_REPLICAS);
2717         else if (degraded + faulted != 0)
2718                 vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE);
2719         else
2720                 vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE);
2721 }
2722 
2723 static inline void
2724 vdev_raidz_trim_append_rc(dkioc_free_list_t *dfl, uint64_t *num_extsp,
2725     const raidz_col_t *rc)
2726 {
2727         uint64_t num_exts = *num_extsp;
2728         ASSERT(rc->rc_size != 0);
2729 
2730         if (dfl->dfl_num_exts > 0 &&
2731             dfl->dfl_exts[num_exts - 1].dfle_start +
2732             dfl->dfl_exts[num_exts - 1].dfle_length == rc->rc_offset) {
2733                 dfl->dfl_exts[num_exts - 1].dfle_length += rc->rc_size;
2734         } else {
2735                 dfl->dfl_exts[num_exts].dfle_start = rc->rc_offset;
2736                 dfl->dfl_exts[num_exts].dfle_length = rc->rc_size;
2737                 (*num_extsp)++;
2738         }
2739 }
2740 
2741 /*
2742  * Processes a trim for a raidz vdev.
2743  */
2744 static void
2745 vdev_raidz_trim(vdev_t *vd, zio_t *pio, void *trim_exts)
2746 {
2747         dkioc_free_list_t *dfl = trim_exts;
2748         dkioc_free_list_t **sub_dfls;
2749         uint64_t *sub_dfls_num_exts;
2750 
2751         sub_dfls = kmem_zalloc(sizeof (*sub_dfls) * vd->vdev_children,
2752             KM_SLEEP);
2753         sub_dfls_num_exts = kmem_zalloc(sizeof (uint64_t) * vd->vdev_children,
2754             KM_SLEEP);
2755         for (int i = 0; i < vd->vdev_children; i++) {
2756                 /*
2757                  * We might over-allocate here, because the sub-lists can never
2758                  * be longer than the parent list, but they can be shorter.
2759                  * The underlying driver will discard zero-length extents.
2760                  */
2761                 sub_dfls[i] = kmem_zalloc(DFL_SZ(dfl->dfl_num_exts), KM_SLEEP);
2762                 sub_dfls[i]->dfl_num_exts = dfl->dfl_num_exts;
2763                 sub_dfls[i]->dfl_flags = dfl->dfl_flags;
2764                 sub_dfls[i]->dfl_offset = dfl->dfl_offset;
2765                 /* don't copy the check func, because it isn't raidz-aware */
2766         }
2767 
2768         /*
2769          * Process all extents and redistribute them to the component vdevs
2770          * according to a computed raidz map geometry.
2771          */
2772         for (int i = 0; i < dfl->dfl_num_exts; i++) {
2773                 uint64_t start = dfl->dfl_exts[i].dfle_start;
2774                 uint64_t length = dfl->dfl_exts[i].dfle_length;
2775                 raidz_map_t *rm = vdev_raidz_map_alloc(NULL,
2776                     vdev_raidz_psize_floor(vd, length), start,
2777                     vd->vdev_top->vdev_ashift, vd->vdev_children,
2778                     vd->vdev_nparity, B_FALSE);
2779 
2780                 for (uint64_t j = 0; j < rm->rm_cols; j++) {
2781                         uint64_t devidx = rm->rm_col[j].rc_devidx;
2782                         vdev_raidz_trim_append_rc(sub_dfls[devidx],
2783                             &sub_dfls_num_exts[devidx], &rm->rm_col[j]);
2784                 }
2785                 vdev_raidz_map_free(rm);
2786         }
2787 
2788         /*
2789          * Issue the component ioctls as children of the parent zio.
2790          */
2791         for (int i = 0; i < vd->vdev_children; i++) {
2792                 if (sub_dfls_num_exts[i] != 0) {
2793                         zio_nowait(zio_ioctl(pio, vd->vdev_child[i]->vdev_spa,
2794                             vd->vdev_child[i], DKIOCFREE,
2795                             vdev_raidz_trim_done, sub_dfls[i],
2796                             ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE |
2797                             ZIO_FLAG_DONT_RETRY));
2798                 } else {
2799                         dfl_free(sub_dfls[i]);
2800                 }
2801         }
2802         kmem_free(sub_dfls, sizeof (*sub_dfls) * vd->vdev_children);
2803         kmem_free(sub_dfls_num_exts, sizeof (uint64_t) * vd->vdev_children);
2804 }
2805 
2806 /*
2807  * Releases a dkioc_free_list_t from ioctls issued to component devices in
2808  * vdev_raidz_dkioc_free.
2809  */
2810 static void
2811 vdev_raidz_trim_done(zio_t *zio)
2812 {
2813         ASSERT(zio->io_private != NULL);
2814         dfl_free(zio->io_private);
2815 }
2816 
/* Operations vector registered for RAID-Z vdevs. */
vdev_ops_t vdev_raidz_ops = {
	vdev_raidz_open,
	vdev_raidz_close,
	vdev_raidz_asize,
	vdev_raidz_io_start,
	vdev_raidz_io_done,
	vdev_raidz_state_change,
	NULL,			/* likely vdev_op_hold — confirm against vdev_ops_t */
	NULL,			/* likely vdev_op_rele — confirm against vdev_ops_t */
	vdev_raidz_trim,
	VDEV_TYPE_RAIDZ,	/* name of this vdev type */
	B_FALSE			/* not a leaf vdev */
};