usr/src/uts/common/fs/zfs/dmu.c

          --- old/usr/src/uts/common/fs/zfs/dmu.c
          +++ new/usr/src/uts/common/fs/zfs/dmu.c
[ 37 lines elided ]
  38   38  #include <sys/dsl_dir.h>
  39   39  #include <sys/dsl_pool.h>
  40   40  #include <sys/dsl_synctask.h>
  41   41  #include <sys/dsl_prop.h>
  42   42  #include <sys/dmu_zfetch.h>
  43   43  #include <sys/zfs_ioctl.h>
  44   44  #include <sys/zap.h>
  45   45  #include <sys/zio_checksum.h>
  46   46  #include <sys/zio_compress.h>
  47   47  #include <sys/sa.h>
       48 +#include <sys/spa_impl.h>
  48   49  #include <sys/zfeature.h>
  49   50  #include <sys/abd.h>
  50   51  #ifdef _KERNEL
  51   52  #include <sys/vmsystm.h>
  52   53  #include <sys/zfs_znode.h>
       54 +#include <sys/zfs_vfsops.h>
  53   55  #endif
       56 +#include <sys/special.h>
  54   57  
  55   58  /*
  56   59   * Enable/disable nopwrite feature.
  57   60   */
  58   61  int zfs_nopwrite_enabled = 1;
  59   62  
  60   63  /*
  61   64   * Tunable to control percentage of dirtied blocks from frees in one TXG.
  62   65   * After this threshold is crossed, additional dirty blocks from frees
  63   66   * wait until the next TXG.
  64   67   * A value of zero will disable this throttle.
  65   68   */
  66   69  uint32_t zfs_per_txg_dirty_frees_percent = 30;
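
A worked example of the cap this implies (the arithmetic mirrors
dmu_free_long_range_impl() below; the helper and its numbers are
illustrative only): with zfs_dirty_data_max at 4 GiB and the default
30%, frees may dirty roughly 1.2 GiB per txg before further frees wait.

    /* Sketch only: the per-txg dirty-frees cap implied by the tunable. */
    static uint64_t
    dirty_frees_threshold_example(uint64_t dirty_data_max)
    {
            if (zfs_per_txg_dirty_frees_percent <= 100)
                    return (zfs_per_txg_dirty_frees_percent *
                        dirty_data_max / 100);
            /* values above 100 fall back to 25%, as in the code below */
            return (dirty_data_max / 4);
    }
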
  67   70  
  68      -/*
  69      - * This can be used for testing, to ensure that certain actions happen
  70      - * while in the middle of a remap (which might otherwise complete too
  71      - * quickly).
  72      - */
  73      -int zfs_object_remap_one_indirect_delay_ticks = 0;
  74      -
  75   71  const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
  76      -        {       DMU_BSWAP_UINT8,        TRUE,   "unallocated"           },
  77      -        {       DMU_BSWAP_ZAP,          TRUE,   "object directory"      },
  78      -        {       DMU_BSWAP_UINT64,       TRUE,   "object array"          },
  79      -        {       DMU_BSWAP_UINT8,        TRUE,   "packed nvlist"         },
  80      -        {       DMU_BSWAP_UINT64,       TRUE,   "packed nvlist size"    },
  81      -        {       DMU_BSWAP_UINT64,       TRUE,   "bpobj"                 },
  82      -        {       DMU_BSWAP_UINT64,       TRUE,   "bpobj header"          },
  83      -        {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map header"  },
  84      -        {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map"         },
  85      -        {       DMU_BSWAP_UINT64,       TRUE,   "ZIL intent log"        },
  86      -        {       DMU_BSWAP_DNODE,        TRUE,   "DMU dnode"             },
  87      -        {       DMU_BSWAP_OBJSET,       TRUE,   "DMU objset"            },
  88      -        {       DMU_BSWAP_UINT64,       TRUE,   "DSL directory"         },
  89      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL directory child map"},
  90      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL dataset snap map"  },
  91      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL props"             },
  92      -        {       DMU_BSWAP_UINT64,       TRUE,   "DSL dataset"           },
  93      -        {       DMU_BSWAP_ZNODE,        TRUE,   "ZFS znode"             },
  94      -        {       DMU_BSWAP_OLDACL,       TRUE,   "ZFS V0 ACL"            },
  95      -        {       DMU_BSWAP_UINT8,        FALSE,  "ZFS plain file"        },
  96      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS directory"         },
  97      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS master node"       },
  98      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS delete queue"      },
  99      -        {       DMU_BSWAP_UINT8,        FALSE,  "zvol object"           },
 100      -        {       DMU_BSWAP_ZAP,          TRUE,   "zvol prop"             },
 101      -        {       DMU_BSWAP_UINT8,        FALSE,  "other uint8[]"         },
 102      -        {       DMU_BSWAP_UINT64,       FALSE,  "other uint64[]"        },
 103      -        {       DMU_BSWAP_ZAP,          TRUE,   "other ZAP"             },
 104      -        {       DMU_BSWAP_ZAP,          TRUE,   "persistent error log"  },
 105      -        {       DMU_BSWAP_UINT8,        TRUE,   "SPA history"           },
 106      -        {       DMU_BSWAP_UINT64,       TRUE,   "SPA history offsets"   },
 107      -        {       DMU_BSWAP_ZAP,          TRUE,   "Pool properties"       },
 108      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL permissions"       },
 109      -        {       DMU_BSWAP_ACL,          TRUE,   "ZFS ACL"               },
 110      -        {       DMU_BSWAP_UINT8,        TRUE,   "ZFS SYSACL"            },
 111      -        {       DMU_BSWAP_UINT8,        TRUE,   "FUID table"            },
 112      -        {       DMU_BSWAP_UINT64,       TRUE,   "FUID table size"       },
 113      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL dataset next clones"},
 114      -        {       DMU_BSWAP_ZAP,          TRUE,   "scan work queue"       },
 115      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS user/group used"   },
 116      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS user/group quota"  },
 117      -        {       DMU_BSWAP_ZAP,          TRUE,   "snapshot refcount tags"},
 118      -        {       DMU_BSWAP_ZAP,          TRUE,   "DDT ZAP algorithm"     },
 119      -        {       DMU_BSWAP_ZAP,          TRUE,   "DDT statistics"        },
 120      -        {       DMU_BSWAP_UINT8,        TRUE,   "System attributes"     },
 121      -        {       DMU_BSWAP_ZAP,          TRUE,   "SA master node"        },
 122      -        {       DMU_BSWAP_ZAP,          TRUE,   "SA attr registration"  },
 123      -        {       DMU_BSWAP_ZAP,          TRUE,   "SA attr layouts"       },
 124      -        {       DMU_BSWAP_ZAP,          TRUE,   "scan translations"     },
 125      -        {       DMU_BSWAP_UINT8,        FALSE,  "deduplicated block"    },
 126      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL deadlist map"      },
 127      -        {       DMU_BSWAP_UINT64,       TRUE,   "DSL deadlist map hdr"  },
 128      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL dir clones"        },
 129      -        {       DMU_BSWAP_UINT64,       TRUE,   "bpobj subobj"          }
       72 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "unallocated"                },
       73 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "object directory"           },
       74 +        { DMU_BSWAP_UINT64, TRUE,  TRUE,   "object array"               },
       75 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "packed nvlist"              },
       76 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "packed nvlist size"         },
       77 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj"                      },
       78 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj header"               },
       79 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map header"       },
       80 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map"              },
       81 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "ZIL intent log"             },
       82 +        { DMU_BSWAP_DNODE,  TRUE,  FALSE,  "DMU dnode"                  },
       83 +        { DMU_BSWAP_OBJSET, TRUE,  TRUE,   "DMU objset"                 },
       84 +        { DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL directory"              },
       85 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL directory child map"    },
       86 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset snap map"       },
       87 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL props"                  },
       88 +        { DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL dataset"                },
       89 +        { DMU_BSWAP_ZNODE,  TRUE,  FALSE,  "ZFS znode"                  },
       90 +        { DMU_BSWAP_OLDACL, TRUE,  FALSE,  "ZFS V0 ACL"                 },
       91 +        { DMU_BSWAP_UINT8,  FALSE, FALSE,  "ZFS plain file"             },
       92 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS directory"              },
       93 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS master node"            },
       94 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS delete queue"           },
       95 +        { DMU_BSWAP_UINT8,  FALSE, FALSE,  "zvol object"                },
       96 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "zvol prop"                  },
       97 +        { DMU_BSWAP_UINT8,  FALSE, FALSE,  "other uint8[]"              },
       98 +        { DMU_BSWAP_UINT64, FALSE, FALSE,  "other uint64[]"             },
       99 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "other ZAP"                  },
      100 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "persistent error log"       },
      101 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "SPA history"                },
      102 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA history offsets"        },
      103 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "Pool properties"            },
      104 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL permissions"            },
      105 +        { DMU_BSWAP_ACL,    TRUE,  FALSE,  "ZFS ACL"                    },
      106 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "ZFS SYSACL"                 },
      107 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "FUID table"                 },
      108 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "FUID table size"            },
      109 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset next clones"    },
      110 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan work queue"            },
      111 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group used"        },
      112 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group quota"       },
      113 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "snapshot refcount tags"     },
      114 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT ZAP algorithm"          },
      115 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT statistics"             },
      116 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "System attributes"          },
      117 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA master node"             },
      118 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr registration"       },
      119 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr layouts"            },
      120 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan translations"          },
      121 +        { DMU_BSWAP_UINT8,  FALSE, FALSE,  "deduplicated block"         },
      122 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL deadlist map"           },
      123 +        { DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL deadlist map hdr"       },
      124 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dir clones"             },
      125 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj subobj"               }
 130  126  };
 131  127  
 132  128  const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 133  129          {       byteswap_uint8_array,   "uint8"         },
 134  130          {       byteswap_uint16_array,  "uint16"        },
 135  131          {       byteswap_uint32_array,  "uint32"        },
 136  132          {       byteswap_uint64_array,  "uint64"        },
 137  133          {       zap_byteswap,           "zap"           },
 138  134          {       dnode_buf_byteswap,     "dnode"         },
 139  135          {       dmu_objset_byteswap,    "objset"        },
[ 565 lines elided ]
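
The dmu_ot[] and dmu_ot_byteswap[] tables above are consulted together
when a buffer from an opposite-endian pool must be byte-swapped. A
minimal sketch of the lookup, assuming the stock illumos field names
(ot_byteswap, ob_func); the new boolean column this change adds does not
alter the lookup path:

    /* Sketch only: byte-swap a buffer of the given DMU object type. */
    static void
    byteswap_dmu_buf_example(dmu_object_type_t type, void *buf, size_t size)
    {
            ASSERT3U(type, <, DMU_OT_NUMTYPES);
            dmu_ot_byteswap[dmu_ot[type].ot_byteswap].ob_func(buf, size);
    }
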
 705  701  
 706  702                  /* set start to the beginning of this L1 indirect */
 707  703                  *start = P2ALIGN(*start, iblkrange);
 708  704          }
 709  705          if (*start < minimum)
 710  706                  *start = minimum;
 711  707          return (0);
 712  708  }
 713  709  
 714  710  /*
 715      - * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
 716      - * otherwise return false.
 717      - * Used below in dmu_free_long_range_impl() to enable abort when unmounting
       711 + * If this dnode is in a ZFS (ZPL) objset, return true if the
       712 + * vfs's unmounted flag is set or the zfsvfs is currently
       713 + * suspended; otherwise return false.
 718  714   */
 719  715  /*ARGSUSED*/
 720  716  static boolean_t
 721      -dmu_objset_zfs_unmounting(objset_t *os)
      717 +dmu_dnode_fs_unmounting_or_suspended(dnode_t *freeing_dn)
 722  718  {
 723  719  #ifdef _KERNEL
 724      -        if (dmu_objset_type(os) == DMU_OST_ZFS)
 725      -                return (zfs_get_vfs_flag_unmounted(os));
 726      -#endif
      720 +        boolean_t busy = B_FALSE;
      721 +        objset_t *os = freeing_dn->dn_objset;
      722 +        zfsvfs_t *zfsvfs;
      723 +
      724 +        if (dmu_objset_type(os) == DMU_OST_ZFS) {
      725 +                mutex_enter(&os->os_user_ptr_lock);
      726 +                zfsvfs = dmu_objset_get_user(os);
      727 +                if (zfsvfs != NULL && zfsvfs->z_vfs != NULL &&
      728 +                    ((zfsvfs->z_vfs->vfs_flag & VFS_UNMOUNTED) ||
      729 +                     zfsvfs->z_busy))
      730 +                        busy = B_TRUE;
      731 +                mutex_exit(&os->os_user_ptr_lock);
      732 +        }
      733 +
      734 +        return (busy);
      735 +#else
 727  736          return (B_FALSE);
      737 +#endif
 728  738  }
 729  739  
 730  740  static int
 731  741  dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 732  742      uint64_t length)
 733  743  {
 734  744          uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 735  745          int err;
 736  746          uint64_t dirty_frees_threshold;
 737  747          dsl_pool_t *dp = dmu_objset_pool(os);
 738  748  
 739  749          if (offset >= object_size)
 740  750                  return (0);
 741  751  
 742  752          if (zfs_per_txg_dirty_frees_percent <= 100)
 743  753                  dirty_frees_threshold =
 744  754                      zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 745  755          else
 746  756                  dirty_frees_threshold = zfs_dirty_data_max / 4;
 747  757  
      758 +        if (length == DMU_OBJECT_END && offset == 0)
      759 +                dnode_evict_dbufs(dn, 0);
      760 +
 748  761          if (length == DMU_OBJECT_END || offset + length > object_size)
 749  762                  length = object_size - offset;
 750  763  
      764 +        mutex_enter(&dp->dp_lock);
      765 +        dp->dp_long_freeing_total += length;
      766 +        mutex_exit(&dp->dp_lock);
      767 +
 751  768          while (length != 0) {
 752  769                  uint64_t chunk_end, chunk_begin, chunk_len;
 753  770                  uint64_t long_free_dirty_all_txgs = 0;
 754  771                  dmu_tx_t *tx;
 755  772  
 756      -                if (dmu_objset_zfs_unmounting(dn->dn_objset))
      773 +                if (dmu_dnode_fs_unmounting_or_suspended(dn)) {
      774 +                        mutex_enter(&dp->dp_lock);
      775 +                        dp->dp_long_freeing_total -= length;
      776 +                        mutex_exit(&dp->dp_lock);
      777 +
 757  778                          return (SET_ERROR(EINTR));
      779 +                }
 758  780  
 759  781                  chunk_end = chunk_begin = offset + length;
 760  782  
 761  783                  /* move chunk_begin backwards to the beginning of this chunk */
 762  784                  err = get_next_chunk(dn, &chunk_begin, offset);
 763  785                  if (err)
 764  786                          return (err);
 765  787                  ASSERT3U(chunk_begin, >=, offset);
 766  788                  ASSERT3U(chunk_begin, <=, chunk_end);
 767  789  
[ 21 lines elided ]
 789  811                  dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 790  812  
 791  813                  /*
 792  814                   * Mark this transaction as typically resulting in a net
 793  815                   * reduction in space used.
 794  816                   */
 795  817                  dmu_tx_mark_netfree(tx);
 796  818                  err = dmu_tx_assign(tx, TXG_WAIT);
 797  819                  if (err) {
 798  820                          dmu_tx_abort(tx);
      821 +                        mutex_enter(&dp->dp_lock);
      822 +                        dp->dp_long_freeing_total -= length - chunk_len;
      823 +                        mutex_exit(&dp->dp_lock);
 799  824                          return (err);
 800  825                  }
 801  826  
 802  827                  mutex_enter(&dp->dp_lock);
 803  828                  dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
 804  829                      chunk_len;
 805  830                  mutex_exit(&dp->dp_lock);
 806  831                  DTRACE_PROBE3(free__long__range,
 807  832                      uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
 808  833                      uint64_t, dmu_tx_get_txg(tx));
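
The EINTR introduced above reaches callers through the public
dmu_free_long_range() wrapper. A hedged sketch of a caller treating it
as a clean stop (the helper is illustrative, not part of this change):

    /* Sketch only: free a whole object, tolerating unmount/suspend. */
    static int
    free_object_example(objset_t *os, uint64_t object)
    {
            int err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);

            if (err == EINTR) {
                    /* fs is unmounting or suspended; stop cleanly */
                    return (0);
            }
            return (err);
    }
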
[ 205 lines elided ]
1014 1039  
1015 1040          if (size == 0)
1016 1041                  return;
1017 1042  
1018 1043          VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
1019 1044              FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
1020 1045          dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
1021 1046          dmu_buf_rele_array(dbp, numbufs, FTAG);
1022 1047  }
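
For context around the write path above, the usual transactional
pattern for dmu_write() looks like the following sketch (stock DMU tx
API; the helper itself is illustrative):

    /* Sketch only: a transactional buffer write via dmu_write(). */
    static int
    dmu_write_example(objset_t *os, uint64_t object, uint64_t off,
        uint64_t len, const void *buf)
    {
            dmu_tx_t *tx = dmu_tx_create(os);
            int err;

            dmu_tx_hold_write(tx, object, off, len);
            err = dmu_tx_assign(tx, TXG_WAIT);
            if (err != 0) {
                    dmu_tx_abort(tx);
                    return (err);
            }
            dmu_write(os, object, off, len, buf, tx);
            dmu_tx_commit(tx);
            return (0);
    }
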
1023 1048  
1024      -static int
1025      -dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
1026      -    uint64_t last_removal_txg, uint64_t offset)
1027      -{
1028      -        uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
1029      -        int err = 0;
1030      -
1031      -        rw_enter(&dn->dn_struct_rwlock, RW_READER);
1032      -        dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
1033      -        ASSERT3P(dbuf, !=, NULL);
1034      -
1035      -        /*
1036      -         * If the block hasn't been written yet, this default will ensure
1037      -         * we don't try to remap it.
1038      -         */
1039      -        uint64_t birth = UINT64_MAX;
1040      -        ASSERT3U(last_removal_txg, !=, UINT64_MAX);
1041      -        if (dbuf->db_blkptr != NULL)
1042      -                birth = dbuf->db_blkptr->blk_birth;
1043      -        rw_exit(&dn->dn_struct_rwlock);
1044      -
1045      -        /*
1046      -         * If this L1 was already written after the last removal, then we've
1047      -         * already tried to remap it.
1048      -         */
1049      -        if (birth <= last_removal_txg &&
1050      -            dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
1051      -            dbuf_can_remap(dbuf)) {
1052      -                dmu_tx_t *tx = dmu_tx_create(os);
1053      -                dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
1054      -                err = dmu_tx_assign(tx, TXG_WAIT);
1055      -                if (err == 0) {
1056      -                        (void) dbuf_dirty(dbuf, tx);
1057      -                        dmu_tx_commit(tx);
1058      -                } else {
1059      -                        dmu_tx_abort(tx);
1060      -                }
1061      -        }
1062      -
1063      -        dbuf_rele(dbuf, FTAG);
1064      -
1065      -        delay(zfs_object_remap_one_indirect_delay_ticks);
1066      -
1067      -        return (err);
1068      -}
1069      -
1070      -/*
1071      - * Remap all blockpointers in the object, if possible, so that they reference
1072      - * only concrete vdevs.
1073      - *
1074      - * To do this, iterate over the L0 blockpointers and remap any that reference
1075      - * an indirect vdev. Note that we only examine L0 blockpointers; since we
1076      - * cannot guarantee that we can remap all blockpointer anyways (due to split
1077      - * blocks), we do not want to make the code unnecessarily complicated to
1078      - * catch the unlikely case that there is an L1 block on an indirect vdev that
1079      - * contains no indirect blockpointers.
1080      - */
1081      -int
1082      -dmu_object_remap_indirects(objset_t *os, uint64_t object,
1083      -    uint64_t last_removal_txg)
1084      -{
1085      -        uint64_t offset, l1span;
1086      -        int err;
1087      -        dnode_t *dn;
1088      -
1089      -        err = dnode_hold(os, object, FTAG, &dn);
1090      -        if (err != 0) {
1091      -                return (err);
1092      -        }
1093      -
1094      -        if (dn->dn_nlevels <= 1) {
1095      -                if (issig(JUSTLOOKING) && issig(FORREAL)) {
1096      -                        err = SET_ERROR(EINTR);
1097      -                }
1098      -
1099      -                /*
1100      -                 * If the dnode has no indirect blocks, we cannot dirty them.
1101      -                 * We still want to remap the blkptr(s) in the dnode if
1102      -                 * appropriate, so mark it as dirty.
1103      -                 */
1104      -                if (err == 0 && dnode_needs_remap(dn)) {
1105      -                        dmu_tx_t *tx = dmu_tx_create(os);
1106      -                        dmu_tx_hold_bonus(tx, dn->dn_object);
1107      -                        if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
1108      -                                dnode_setdirty(dn, tx);
1109      -                                dmu_tx_commit(tx);
1110      -                        } else {
1111      -                                dmu_tx_abort(tx);
1112      -                        }
1113      -                }
1114      -
1115      -                dnode_rele(dn, FTAG);
1116      -                return (err);
1117      -        }
1118      -
1119      -        offset = 0;
1120      -        l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
1121      -            dn->dn_datablkshift);
1122      -        /*
1123      -         * Find the next L1 indirect that is not a hole.
1124      -         */
1125      -        while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
1126      -                if (issig(JUSTLOOKING) && issig(FORREAL)) {
1127      -                        err = SET_ERROR(EINTR);
1128      -                        break;
1129      -                }
1130      -                if ((err = dmu_object_remap_one_indirect(os, dn,
1131      -                    last_removal_txg, offset)) != 0) {
1132      -                        break;
1133      -                }
1134      -                offset += l1span;
1135      -        }
1136      -
1137      -        dnode_rele(dn, FTAG);
1138      -        return (err);
1139      -}
1140      -
1141 1049  void
1142 1050  dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1143 1051      dmu_tx_t *tx)
1144 1052  {
1145 1053          dmu_buf_t **dbp;
1146 1054          int numbufs, i;
1147 1055  
1148 1056          if (size == 0)
1149 1057                  return;
1150 1058  
[ 529 lines elided ]
1680 1588          dmu_sync_ready(zio, NULL, zio->io_private);
1681 1589  }
1682 1590  
1683 1591  /* ARGSUSED */
1684 1592  static void
1685 1593  dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
1686 1594  {
1687 1595          dmu_sync_arg_t *dsa = varg;
1688 1596          dbuf_dirty_record_t *dr = dsa->dsa_dr;
1689 1597          dmu_buf_impl_t *db = dr->dr_dbuf;
     1598 +        zgd_t *zgd = dsa->dsa_zgd;
1690 1599  
     1600 +        /*
     1601 +         * Record the vdev(s) backing this blkptr so they can be flushed after
     1602 +         * the writes for the lwb have completed.
     1603 +         */
     1604 +        if (zio->io_error == 0) {
     1605 +                zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
     1606 +        }
     1607 +
1691 1608          mutex_enter(&db->db_mtx);
1692 1609          ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1693 1610          if (zio->io_error == 0) {
1694 1611                  dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
1695 1612                  if (dr->dt.dl.dr_nopwrite) {
1696 1613                          blkptr_t *bp = zio->io_bp;
1697 1614                          blkptr_t *bp_orig = &zio->io_bp_orig;
1698 1615                          uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
1699 1616  
1700 1617                          ASSERT(BP_EQUAL(bp, bp_orig));
[ 29 lines elided ]
1730 1647  
1731 1648          kmem_free(dsa, sizeof (*dsa));
1732 1649  }
1733 1650  
1734 1651  static void
1735 1652  dmu_sync_late_arrival_done(zio_t *zio)
1736 1653  {
1737 1654          blkptr_t *bp = zio->io_bp;
1738 1655          dmu_sync_arg_t *dsa = zio->io_private;
1739 1656          blkptr_t *bp_orig = &zio->io_bp_orig;
     1657 +        zgd_t *zgd = dsa->dsa_zgd;
1740 1658  
1741      -        if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1742      -                ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
1743      -                ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
1744      -                ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1745      -                ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1746      -                zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
     1659 +        if (zio->io_error == 0) {
     1660 +                /*
     1661 +                 * Record the vdev(s) backing this blkptr so they can be
     1662 +                 * flushed after the writes for the lwb have completed.
     1663 +                 */
     1664 +                zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
     1665 +
     1666 +                if (!BP_IS_HOLE(bp)) {
     1667 +                        ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
     1668 +                        ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
     1669 +                        ASSERT(zio->io_bp->blk_birth == zio->io_txg);
     1670 +                        ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
     1671 +                        zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
     1672 +                }
1747 1673          }
1748 1674  
1749 1675          dmu_tx_commit(dsa->dsa_tx);
1750 1676  
1751 1677          dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1752 1678  
1753 1679          abd_put(zio->io_abd);
1754 1680          kmem_free(dsa, sizeof (*dsa));
1755 1681  }
1756 1682  
1757 1683  static int
1758 1684  dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
1759      -    zio_prop_t *zp, zbookmark_phys_t *zb)
     1685 +    zio_prop_t *zp, zbookmark_phys_t *zb, const zio_smartcomp_info_t *sc)
1760 1686  {
1761 1687          dmu_sync_arg_t *dsa;
1762 1688          dmu_tx_t *tx;
1763 1689  
1764 1690          tx = dmu_tx_create(os);
1765 1691          dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1766 1692          if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1767 1693                  dmu_tx_abort(tx);
 1768 1694                  /* Make zl_get_data do txg_wait_synced() */
1769 1695                  return (SET_ERROR(EIO));
[ 32 lines elided ]
1802 1728           * would need to check that this dbuf is not dirty in any future
1803 1729           * txg's (as we do in the normal dmu_sync() path). For simplicity, we
1804 1730           * don't nopwrite in this case.
1805 1731           */
1806 1732          zp->zp_nopwrite = B_FALSE;
1807 1733  
1808 1734          zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1809 1735              abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
1810 1736              zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
1811 1737              dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
1812      -            dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
     1738 +            dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb, sc));
1813 1739  
1814 1740          return (0);
1815 1741  }
1816 1742  
1817 1743  /*
1818 1744   * Intent log support: sync the block associated with db to disk.
1819 1745   * N.B. and XXX: the caller is responsible for making sure that the
1820 1746   * data isn't changing while dmu_sync() is writing it.
1821 1747   *
1822 1748   * Return values:
[ 9 lines elided ]
1832 1758   *
1833 1759   *      EIO: could not do the I/O.
1834 1760   *              The caller should do a txg_wait_synced().
1835 1761   *
1836 1762   *      0: the I/O has been initiated.
1837 1763   *              The caller should log this blkptr in the done callback.
1838 1764   *              It is possible that the I/O will fail, in which case
1839 1765   *              the error will be reported to the done callback and
1840 1766   *              propagated to pio from zio_done().
1841 1767   */
     1768 +
1842 1769  int
1843 1770  dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
1844 1771  {
1845 1772          dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1846 1773          objset_t *os = db->db_objset;
1847 1774          dsl_dataset_t *ds = os->os_dsl_dataset;
1848 1775          dbuf_dirty_record_t *dr;
1849 1776          dmu_sync_arg_t *dsa;
1850 1777          zbookmark_phys_t zb;
1851 1778          zio_prop_t zp;
1852 1779          dnode_t *dn;
     1780 +        int flags = 0;
     1781 +        zio_smartcomp_info_t sc;
1853 1782  
1854 1783          ASSERT(pio != NULL);
1855 1784          ASSERT(txg != 0);
1856 1785  
1857 1786          SET_BOOKMARK(&zb, ds->ds_object,
1858 1787              db->db.db_object, db->db_level, db->db_blkid);
1859 1788  
      1789 +        /* write to the special class only if the proper conditions hold */
     1790 +        if (spa_write_data_to_special(os->os_spa, os))
     1791 +                WP_SET_SPECIALCLASS(flags, B_TRUE);
     1792 +
1860 1793          DB_DNODE_ENTER(db);
1861 1794          dn = DB_DNODE(db);
1862      -        dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
     1795 +        dmu_write_policy(os, dn, db->db_level, flags | WP_DMU_SYNC, &zp);
     1796 +        dnode_setup_zio_smartcomp(db, &sc);
1863 1797          DB_DNODE_EXIT(db);
1864 1798  
1865 1799          /*
1866 1800           * If we're frozen (running ziltest), we always need to generate a bp.
1867 1801           */
1868 1802          if (txg > spa_freeze_txg(os->os_spa))
1869      -                return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
     1803 +                return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb,
     1804 +                    &sc));
1870 1805  
1871 1806          /*
1872 1807           * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1873 1808           * and us.  If we determine that this txg is not yet syncing,
1874 1809           * but it begins to sync a moment later, that's OK because the
1875 1810           * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
1876 1811           */
1877 1812          mutex_enter(&db->db_mtx);
1878 1813  
1879 1814          if (txg <= spa_last_synced_txg(os->os_spa)) {
[ 3 lines elided ]
1883 1818                  mutex_exit(&db->db_mtx);
1884 1819                  return (SET_ERROR(EEXIST));
1885 1820          }
1886 1821  
1887 1822          if (txg <= spa_syncing_txg(os->os_spa)) {
1888 1823                  /*
1889 1824                   * This txg is currently syncing, so we can't mess with
1890 1825                   * the dirty record anymore; just write a new log block.
1891 1826                   */
1892 1827                  mutex_exit(&db->db_mtx);
1893      -                return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
     1828 +                return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb,
     1829 +                    &sc));
1894 1830          }
1895 1831  
1896 1832          dr = db->db_last_dirty;
1897 1833          while (dr && dr->dr_txg != txg)
1898 1834                  dr = dr->dr_next;
1899 1835  
1900 1836          if (dr == NULL) {
1901 1837                  /*
1902 1838                   * There's no dr for this dbuf, so it must have been freed.
1903 1839                   * There's no need to log writes to freed blocks, so we're done.
[ 65 lines elided ]
1969 1905  
1970 1906          dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1971 1907          dsa->dsa_dr = dr;
1972 1908          dsa->dsa_done = done;
1973 1909          dsa->dsa_zgd = zgd;
1974 1910          dsa->dsa_tx = NULL;
1975 1911  
1976 1912          zio_nowait(arc_write(pio, os->os_spa, txg,
1977 1913              zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
1978 1914              &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
1979      -            ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
     1915 +            ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb, &sc));
1980 1916  
1981 1917          return (0);
1982 1918  }
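
A hedged sketch of how a ZIL get_data callback typically consumes the
dmu_sync() return values documented above (the caller shape is assumed;
only the EIO fallback follows directly from the contract):

    /* Sketch only: issue dmu_sync() and honor its return contract. */
    static int
    get_data_sync_example(zio_t *pio, uint64_t txg, zgd_t *zgd,
        dmu_sync_cb_t *done, objset_t *os)
    {
            int err = dmu_sync(pio, txg, done, zgd);

            if (err == EIO) {
                    /* could not do the I/O; force the txg out instead */
                    txg_wait_synced(dmu_objset_pool(os), txg);
                    err = 0;
            }
            /* EEXIST: already on disk, nothing to log; 0: in flight */
            return (err);
    }
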
1983 1919  
1984 1920  int
1985 1921  dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1986 1922      dmu_tx_t *tx)
1987 1923  {
1988 1924          dnode_t *dn;
1989 1925          int err;
[ 145 lines elided ]
2135 2071                   * algorithm (see comment in zio_nop_write) and
2136 2072                   * compression is enabled.  We don't enable nopwrite if
2137 2073                   * dedup is enabled as the two features are mutually
2138 2074                   * exclusive.
2139 2075                   */
2140 2076                  nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
2141 2077                      ZCHECKSUM_FLAG_NOPWRITE) &&
2142 2078                      compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
2143 2079          }
2144 2080  
     2081 +        zp->zp_usesc = WP_GET_SPECIALCLASS(wp);
2145 2082          zp->zp_checksum = checksum;
2146 2083          zp->zp_compress = compress;
2147 2084          ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
2148 2085  
2149 2086          zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
2150 2087          zp->zp_level = level;
2151 2088          zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
2152 2089          zp->zp_dedup = dedup;
2153 2090          zp->zp_dedup_verify = dedup && dedup_verify;
     2091 +        zp->zp_metadata = ismd;
2154 2092          zp->zp_nopwrite = nopwrite;
     2093 +        zp->zp_zpl_meta_to_special = os->os_zpl_meta_to_special;
     2094 +        zp->zp_usewbc = (zp->zp_usesc &&
     2095 +            os->os_wbc_mode == ZFS_WBC_MODE_ON && !ismd);
     2096 +
      2097 +        /* explicitly control the number of copies for DDT metadata */
     2098 +        if (DMU_OT_IS_DDT_META(type) &&
     2099 +            os->os_spa->spa_ddt_meta_copies > 0) {
     2100 +                zp->zp_copies =
     2101 +                    MIN(os->os_spa->spa_ddt_meta_copies,
     2102 +                    spa_max_replication(os->os_spa));
     2103 +        }
     2104 +
     2105 +        DTRACE_PROBE2(dmu_wp, boolean_t, zp->zp_metadata,
     2106 +            boolean_t, zp->zp_usesc);
2155 2107  }
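
The spa_ddt_meta_copies override added above reduces to a simple
precedence rule; a sketch (spa_ddt_meta_copies is the tunable this
changeset introduces, and the helper is illustrative only):

    /* Sketch only: effective copies for DDT metadata under the override. */
    static uint64_t
    ddt_meta_copies_example(spa_t *spa, uint64_t dataset_copies)
    {
            if (spa->spa_ddt_meta_copies > 0)
                    return (MIN(spa->spa_ddt_meta_copies,
                        spa_max_replication(spa)));
            /* zero leaves the dataset's copies property in force */
            return (MIN(dataset_copies, spa_max_replication(spa)));
    }
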
2156 2108  
2157 2109  int
2158 2110  dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
2159 2111  {
2160 2112          dnode_t *dn;
2161 2113          int err;
2162 2114  
2163 2115          /*
2164 2116           * Sync any current changes before
[ 207 lines elided ]