Print this page
NEX-19083 backport OS-7314 zil_commit should omit cache thrash
9962 zil_commit should omit cache thrash
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Joshua M. Clulow <josh@sysmgr.org>
NEX-5367 special vdev: sync-write options (NEW)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
6250 zvol_dump_init() can hold txg open
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Albert Lee <trisk@omniti.com>
Reviewed by: Xin Li <delphij@freebsd.org>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-4582 update wrc test cases to allow use of write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
NEX-3508 CLONE - Port NEX-2946 Add UNMAP/TRIM functionality to ZFS and illumos
Reviewed by: Josef Sipek <josef.sipek@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
Conflicts:
    usr/src/uts/common/io/scsi/targets/sd.c
    usr/src/uts/common/sys/scsi/targets/sddef.h
NEX-1142 move rwlock to vdev to protect vdev_tsd
not just ldi handle.
This way we serialize open/close, yet allow parallel I/O.
4370 avoid transmitting holes during zfs send
4371 DMU code clean up
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Josef 'Jeff' Sipek <jeffpc@josefsipek.net>
Approved by: Garrett D'Amore <garrett@damore.org>
NEX-1065 Added serialization to avoid race
between ldi notification and I/O path.
Also fixes OS-124, NEX-1051, NEX-1062.

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/zvol.c
          +++ new/usr/src/uts/common/fs/zfs/zvol.c
↓ open down ↓ 18 lines elided ↑ open up ↑
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   *
  24   24   * Portions Copyright 2010 Robert Milkowski
  25   25   *
  26   26   * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  27   27   * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  28   28   * Copyright (c) 2013, Joyent, Inc. All rights reserved.
       29 + * Copyright 2017 Nexenta Systems, Inc.  All rights reserved.
  29   30   * Copyright (c) 2014 Integros [integros.com]
  30   31   */
  31   32  
  32   33  /*
  33   34   * ZFS volume emulation driver.
  34   35   *
  35   36   * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes.
  36   37   * Volumes are accessed through the symbolic links named:
  37   38   *
  38   39   * /dev/zvol/dsk/<pool_name>/<dataset_name>
↓ open down ↓ 42 lines elided ↑ open up ↑
  81   82  #include <sys/vdev_disk.h>
  82   83  #include <sys/vdev_impl.h>
  83   84  #include <sys/vdev_raidz.h>
  84   85  #include <sys/zvol.h>
  85   86  #include <sys/dumphdr.h>
  86   87  #include <sys/zil_impl.h>
  87   88  #include <sys/dbuf.h>
  88   89  #include <sys/dmu_tx.h>
  89   90  #include <sys/zfeature.h>
  90   91  #include <sys/zio_checksum.h>
       92 +#include <sys/dkioc_free_util.h>
  91   93  #include <sys/zil_impl.h>
  92   94  
  93   95  #include "zfs_namecheck.h"
  94   96  
  95   97  void *zfsdev_state;
  96   98  static char *zvol_tag = "zvol_tag";
  97   99  
  98  100  #define ZVOL_DUMPSIZE           "dumpsize"
  99  101  
 100  102  /*
↓ open down ↓ 864 lines elided ↑ open up ↑
 965  967          zv->zv_open_count[otyp]--;
 966  968          zv->zv_total_opens--;
 967  969  
 968  970          if (zv->zv_total_opens == 0)
 969  971                  zvol_last_close(zv);
 970  972  
 971  973          mutex_exit(&zfsdev_state_lock);
 972  974          return (error);
 973  975  }
 974  976  
      977 +/* ARGSUSED */
 975  978  static void
 976  979  zvol_get_done(zgd_t *zgd, int error)
 977  980  {
 978  981          if (zgd->zgd_db)
 979  982                  dmu_buf_rele(zgd->zgd_db, zgd);
 980  983  
 981  984          zfs_range_unlock(zgd->zgd_rl);
 982  985  
 983      -        if (error == 0 && zgd->zgd_bp)
 984      -                zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
 985      -
 986  986          kmem_free(zgd, sizeof (zgd_t));
 987  987  }
 988  988  
 989  989  /*
 990  990   * Get data to generate a TX_WRITE intent log record.
 991  991   */
 992  992  static int
 993  993  zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
 994  994  {
 995  995          zvol_state_t *zv = arg;
↓ open down ↓ 66 lines elided ↑ open up ↑
1062 1062   * Otherwise we will later flush the data out via dmu_sync().
1063 1063   */
1064 1064  ssize_t zvol_immediate_write_sz = 32768;
1065 1065  
1066 1066  static void
1067 1067  zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid,
1068 1068      boolean_t sync)
1069 1069  {
1070 1070          uint32_t blocksize = zv->zv_volblocksize;
1071 1071          zilog_t *zilog = zv->zv_zilog;
     1072 +        spa_t *spa = zilog->zl_spa;
     1073 +        spa_meta_placement_t *mp = &spa->spa_meta_policy;
     1074 +        boolean_t slogging, zil_to_special, write_to_special;
     1075 +        ssize_t immediate_write_sz;
1072 1076          itx_wr_state_t write_state;
1073 1077  
1074 1078          if (zil_replaying(zilog, tx))
1075 1079                  return;
1076 1080  
1077      -        if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
     1081 +        /*
     1082 +         * See comments in zfs_log_write()
     1083 +         */
     1084 +
     1085 +        immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
     1086 +            ? 0 : zvol_immediate_write_sz;
     1087 +
     1088 +        zil_to_special = !spa_has_slogs(spa) &&
     1089 +            spa_can_special_be_used(spa) &&
     1090 +            mp->spa_sync_to_special != SYNC_TO_SPECIAL_DISABLED;
     1091 +
     1092 +        write_to_special = !spa_has_slogs(spa) &&
     1093 +            spa_write_data_to_special(spa, zilog->zl_os) &&
     1094 +            (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
     1095 +            (mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED &&
     1096 +            spa->spa_avg_stat_rotor % 100 < spa->spa_special_to_normal_ratio));
     1097 +
     1098 +        slogging = (spa_has_slogs(spa) || zil_to_special) &&
     1099 +            (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
     1100 +
     1101 +        if (blocksize > immediate_write_sz && !slogging &&
     1102 +            resid >= blocksize && off % blocksize == 0)
1078 1103                  write_state = WR_INDIRECT;
     1104 +        else if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
     1105 +                write_state = WR_INDIRECT;
1079 1106          else if (!spa_has_slogs(zilog->zl_spa) &&
1080 1107              resid >= blocksize && blocksize > zvol_immediate_write_sz)
1081 1108                  write_state = WR_INDIRECT;
     1109 +        else if (write_to_special)
     1110 +                 write_state = WR_INDIRECT;
1082 1111          else if (sync)
1083 1112                  write_state = WR_COPIED;
1084 1113          else
1085 1114                  write_state = WR_NEED_COPY;
1086 1115  
1087 1116          while (resid) {
1088 1117                  itx_t *itx;
1089 1118                  lr_write_t *lr;
1090 1119                  itx_wr_state_t wr_state = write_state;
1091 1120                  ssize_t len = resid;
↓ open down ↓ 29 lines elided ↑ open up ↑
1121 1150                  off += len;
1122 1151                  resid -= len;
1123 1152          }
1124 1153  }
1125 1154  
1126 1155  static int
1127 1156  zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t origoffset,
1128 1157      uint64_t size, boolean_t doread, boolean_t isdump)
1129 1158  {
1130 1159          vdev_disk_t *dvd;
1131      -        int c;
     1160 +        int c, rc;
1132 1161          int numerrors = 0;
1133 1162  
1134 1163          if (vd->vdev_ops == &vdev_mirror_ops ||
1135 1164              vd->vdev_ops == &vdev_replacing_ops ||
1136 1165              vd->vdev_ops == &vdev_spare_ops) {
1137 1166                  for (c = 0; c < vd->vdev_children; c++) {
1138 1167                          int err = zvol_dumpio_vdev(vd->vdev_child[c],
1139 1168                              addr, offset, origoffset, size, doread, isdump);
1140 1169                          if (err != 0) {
1141 1170                                  numerrors++;
↓ open down ↓ 11 lines elided ↑ open up ↑
1153 1182          else if (!doread && !vdev_writeable(vd))
1154 1183                  return (SET_ERROR(EIO));
1155 1184  
1156 1185          if (vd->vdev_ops == &vdev_raidz_ops) {
1157 1186                  return (vdev_raidz_physio(vd,
1158 1187                      addr, size, offset, origoffset, doread, isdump));
1159 1188          }
1160 1189  
1161 1190          offset += VDEV_LABEL_START_SIZE;
1162 1191  
     1192 +        rw_enter(&vd->vdev_tsd_lock, RW_READER);
     1193 +        dvd = vd->vdev_tsd;
1163 1194          if (ddi_in_panic() || isdump) {
1164 1195                  ASSERT(!doread);
1165      -                if (doread)
     1196 +                if (doread) {
     1197 +                        rw_exit(&vd->vdev_tsd_lock);
1166 1198                          return (SET_ERROR(EIO));
1167      -                dvd = vd->vdev_tsd;
     1199 +                }
     1200 +                /* We assume here dvd is not NULL */
1168 1201                  ASSERT3P(dvd, !=, NULL);
1169      -                return (ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
1170      -                    lbtodb(size)));
     1202 +
     1203 +                /* If our assumption is wrong, we do not want to crash */
     1204 +                if (dvd != NULL && dvd->vd_lh != NULL) {
     1205 +                        rc = ldi_dump(dvd->vd_lh, addr, lbtodb(offset),
     1206 +                            lbtodb(size));
     1207 +                } else {
     1208 +                        rc = SET_ERROR(ENXIO);
     1209 +                }
1171 1210          } else {
1172      -                dvd = vd->vdev_tsd;
     1211 +                /* We assume here dvd is not NULL */
1173 1212                  ASSERT3P(dvd, !=, NULL);
1174      -                return (vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
1175      -                    offset, doread ? B_READ : B_WRITE));
     1213 +
     1214 +                /* If our assumption is wrong, we do not want to crash */
     1215 +                if (dvd != NULL && dvd->vd_lh != NULL) {
     1216 +                        rc = vdev_disk_ldi_physio(dvd->vd_lh, addr, size,
     1217 +                            offset, doread ? B_READ : B_WRITE);
     1218 +                } else {
     1219 +                        rc = SET_ERROR(ENXIO);
     1220 +                }
1176 1221          }
     1222 +        rw_exit(&vd->vdev_tsd_lock);
     1223 +        return (rc);
1177 1224  }
1178 1225  
1179 1226  static int
1180 1227  zvol_dumpio(zvol_state_t *zv, void *addr, uint64_t offset, uint64_t size,
1181 1228      boolean_t doread, boolean_t isdump)
1182 1229  {
1183 1230          vdev_t *vd;
1184 1231          int error;
1185 1232          zvol_extent_t *ze;
1186 1233          spa_t *spa = dmu_objset_spa(zv->zv_objset);
↓ open down ↓ 586 lines elided ↑ open up ↑
1773 1820                  if (!(zv->zv_flags & ZVOL_DUMPIFIED))
1774 1821                          break;
1775 1822                  rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize,
1776 1823                      RL_WRITER);
1777 1824                  error = zvol_dump_fini(zv);
1778 1825                  zfs_range_unlock(rl);
1779 1826                  break;
1780 1827  
1781 1828          case DKIOCFREE:
1782 1829          {
1783      -                dkioc_free_t df;
     1830 +                dkioc_free_list_t *dfl;
1784 1831                  dmu_tx_t *tx;
1785 1832  
     1833 +                mutex_exit(&zfsdev_state_lock);
     1834 +
1786 1835                  if (!zvol_unmap_enabled)
1787 1836                          break;
1788 1837  
1789      -                if (ddi_copyin((void *)arg, &df, sizeof (df), flag)) {
1790      -                        error = SET_ERROR(EFAULT);
1791      -                        break;
     1838 +                if (!(flag & FKIOCTL)) {
     1839 +                        dfl = dfl_copyin((void *)arg, flag, KM_SLEEP);
     1840 +                        if (dfl == NULL) {
     1841 +                                error = SET_ERROR(EFAULT);
     1842 +                                break;
     1843 +                        }
     1844 +                } else {
     1845 +                        dfl = (dkioc_free_list_t *)arg;
1792 1846                  }
1793 1847  
1794      -                /*
1795      -                 * Apply Postel's Law to length-checking.  If they overshoot,
1796      -                 * just blank out until the end, if there's a need to blank
1797      -                 * out anything.
1798      -                 */
1799      -                if (df.df_start >= zv->zv_volsize)
1800      -                        break;  /* No need to do anything... */
     1848 +                for (int i = 0; i < dfl->dfl_num_exts; i++) {
     1849 +                        uint64_t start = dfl->dfl_exts[i].dfle_start,
     1850 +                            length = dfl->dfl_exts[i].dfle_length,
     1851 +                            end = start + length;
1801 1852  
1802      -                mutex_exit(&zfsdev_state_lock);
     1853 +                        /*
     1854 +                         * Apply Postel's Law to length-checking.  If they
     1855 +                         * overshoot, just blank out until the end, if there's
     1856 +                         * a need to blank out anything.
     1857 +                         */
     1858 +                        if (start >= zv->zv_volsize)
     1859 +                                continue;       /* No need to do anything... */
     1860 +                        if (end > zv->zv_volsize) {
     1861 +                                end = DMU_OBJECT_END;
     1862 +                                length = end - start;
     1863 +                        }
1803 1864  
1804      -                rl = zfs_range_lock(&zv->zv_znode, df.df_start, df.df_length,
1805      -                    RL_WRITER);
1806      -                tx = dmu_tx_create(zv->zv_objset);
1807      -                dmu_tx_mark_netfree(tx);
1808      -                error = dmu_tx_assign(tx, TXG_WAIT);
1809      -                if (error != 0) {
1810      -                        dmu_tx_abort(tx);
1811      -                } else {
1812      -                        zvol_log_truncate(zv, tx, df.df_start,
1813      -                            df.df_length, B_TRUE);
1814      -                        dmu_tx_commit(tx);
1815      -                        error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ,
1816      -                            df.df_start, df.df_length);
1817      -                }
     1865 +                        rl = zfs_range_lock(&zv->zv_znode, start, length,
     1866 +                            RL_WRITER);
     1867 +                        tx = dmu_tx_create(zv->zv_objset);
     1868 +                        error = dmu_tx_assign(tx, TXG_WAIT);
     1869 +                        if (error != 0) {
     1870 +                                dmu_tx_abort(tx);
     1871 +                        } else {
     1872 +                                zvol_log_truncate(zv, tx, start, length,
     1873 +                                    B_TRUE);
     1874 +                                dmu_tx_commit(tx);
     1875 +                                error = dmu_free_long_range(zv->zv_objset,
     1876 +                                    ZVOL_OBJ, start, length);
     1877 +                        }
1818 1878  
1819      -                zfs_range_unlock(rl);
     1879 +                        zfs_range_unlock(rl);
1820 1880  
     1881 +                        if (error != 0)
     1882 +                                break;
     1883 +                }
     1884 +
1821 1885                  /*
1822 1886                   * If the write-cache is disabled, 'sync' property
1823 1887                   * is set to 'always', or if the caller is asking for
1824 1888                   * a synchronous free, commit this operation to the zil.
1825 1889                   * This will sync any previous uncommitted writes to the
1826 1890                   * zvol object.
1827 1891                   * Can be overridden by the zvol_unmap_sync_enabled tunable.
1828 1892                   */
1829 1893                  if ((error == 0) && zvol_unmap_sync_enabled &&
1830 1894                      (!(zv->zv_flags & ZVOL_WCE) ||
1831 1895                      (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS) ||
1832      -                    (df.df_flags & DF_WAIT_SYNC))) {
     1896 +                    (dfl->dfl_flags & DF_WAIT_SYNC))) {
1833 1897                          zil_commit(zv->zv_zilog, ZVOL_OBJ);
1834 1898                  }
1835 1899  
     1900 +                if (!(flag & FKIOCTL))
     1901 +                        dfl_free(dfl);
     1902 +
1836 1903                  return (error);
1837 1904          }
1838 1905  
1839 1906          default:
1840 1907                  error = SET_ERROR(ENOTTY);
1841 1908                  break;
1842 1909  
1843 1910          }
1844 1911          mutex_exit(&zfsdev_state_lock);
1845 1912          return (error);
↓ open down ↓ 326 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX