usr/src/uts/common/fs/zfs/dmu.c
    
      
    
    
          --- old/usr/src/uts/common/fs/zfs/dmu.c
          +++ new/usr/src/uts/common/fs/zfs/dmu.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  24   24   */
  25   25  /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
  26   26  /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
  27   27  /* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */
  28   28  
  29   29  #include <sys/dmu.h>
  30   30  #include <sys/dmu_impl.h>
  31   31  #include <sys/dmu_tx.h>
  32   32  #include <sys/dbuf.h>
  33   33  #include <sys/dnode.h>
  34   34  #include <sys/zfs_context.h>
  35   35  #include <sys/dmu_objset.h>
  36   36  #include <sys/dmu_traverse.h>
  37   37  #include <sys/dsl_dataset.h>
  
  38   38  #include <sys/dsl_dir.h>
  39   39  #include <sys/dsl_pool.h>
  40   40  #include <sys/dsl_synctask.h>
  41   41  #include <sys/dsl_prop.h>
  42   42  #include <sys/dmu_zfetch.h>
  43   43  #include <sys/zfs_ioctl.h>
  44   44  #include <sys/zap.h>
  45   45  #include <sys/zio_checksum.h>
  46   46  #include <sys/zio_compress.h>
  47   47  #include <sys/sa.h>
       48 +#include <sys/spa_impl.h>
  48   49  #include <sys/zfeature.h>
  49   50  #include <sys/abd.h>
  50   51  #ifdef _KERNEL
  51   52  #include <sys/vmsystm.h>
  52   53  #include <sys/zfs_znode.h>
       54 +#include <sys/zfs_vfsops.h>
  53   55  #endif
       56 +#include <sys/special.h>
  54   57  
  55   58  /*
  56   59   * Enable/disable nopwrite feature.
  57   60   */
  58   61  int zfs_nopwrite_enabled = 1;
  59   62  
  60   63  /*
  61   64   * Tunable to control percentage of dirtied blocks from frees in one TXG.
  62   65   * After this threshold is crossed, additional dirty blocks from frees
  63   66   * wait until the next TXG.
  64   67   * A value of zero will disable this throttle.
  65   68   */
  66   69  uint32_t zfs_per_txg_dirty_frees_percent = 30;
  67   70  
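
As a worked example of this throttle (assuming zfs_dirty_data_max is 4 GiB; that value is a separate tunable, not part of this change): dmu_free_long_range_impl() below computes dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100, so the default 30% allows roughly 1.2 GiB of freed blocks to be dirtied in the open TXGs before further frees wait for the next TXG.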
  68      -/*
  69      - * This can be used for testing, to ensure that certain actions happen
  70      - * while in the middle of a remap (which might otherwise complete too
  71      - * quickly).
  72      - */
  73      -int zfs_object_remap_one_indirect_delay_ticks = 0;
  74      -
  75   71  const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
  76      -        {       DMU_BSWAP_UINT8,        TRUE,   "unallocated"           },
  77      -        {       DMU_BSWAP_ZAP,          TRUE,   "object directory"      },
  78      -        {       DMU_BSWAP_UINT64,       TRUE,   "object array"          },
  79      -        {       DMU_BSWAP_UINT8,        TRUE,   "packed nvlist"         },
  80      -        {       DMU_BSWAP_UINT64,       TRUE,   "packed nvlist size"    },
  81      -        {       DMU_BSWAP_UINT64,       TRUE,   "bpobj"                 },
  82      -        {       DMU_BSWAP_UINT64,       TRUE,   "bpobj header"          },
  83      -        {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map header"  },
  84      -        {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map"         },
  85      -        {       DMU_BSWAP_UINT64,       TRUE,   "ZIL intent log"        },
  86      -        {       DMU_BSWAP_DNODE,        TRUE,   "DMU dnode"             },
  87      -        {       DMU_BSWAP_OBJSET,       TRUE,   "DMU objset"            },
  88      -        {       DMU_BSWAP_UINT64,       TRUE,   "DSL directory"         },
  89      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL directory child map"},
  90      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL dataset snap map"  },
  91      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL props"             },
  92      -        {       DMU_BSWAP_UINT64,       TRUE,   "DSL dataset"           },
  93      -        {       DMU_BSWAP_ZNODE,        TRUE,   "ZFS znode"             },
  94      -        {       DMU_BSWAP_OLDACL,       TRUE,   "ZFS V0 ACL"            },
  95      -        {       DMU_BSWAP_UINT8,        FALSE,  "ZFS plain file"        },
  96      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS directory"         },
  97      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS master node"       },
  98      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS delete queue"      },
  99      -        {       DMU_BSWAP_UINT8,        FALSE,  "zvol object"           },
 100      -        {       DMU_BSWAP_ZAP,          TRUE,   "zvol prop"             },
 101      -        {       DMU_BSWAP_UINT8,        FALSE,  "other uint8[]"         },
 102      -        {       DMU_BSWAP_UINT64,       FALSE,  "other uint64[]"        },
 103      -        {       DMU_BSWAP_ZAP,          TRUE,   "other ZAP"             },
 104      -        {       DMU_BSWAP_ZAP,          TRUE,   "persistent error log"  },
 105      -        {       DMU_BSWAP_UINT8,        TRUE,   "SPA history"           },
 106      -        {       DMU_BSWAP_UINT64,       TRUE,   "SPA history offsets"   },
 107      -        {       DMU_BSWAP_ZAP,          TRUE,   "Pool properties"       },
 108      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL permissions"       },
 109      -        {       DMU_BSWAP_ACL,          TRUE,   "ZFS ACL"               },
 110      -        {       DMU_BSWAP_UINT8,        TRUE,   "ZFS SYSACL"            },
 111      -        {       DMU_BSWAP_UINT8,        TRUE,   "FUID table"            },
 112      -        {       DMU_BSWAP_UINT64,       TRUE,   "FUID table size"       },
 113      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL dataset next clones"},
 114      -        {       DMU_BSWAP_ZAP,          TRUE,   "scan work queue"       },
 115      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS user/group used"   },
 116      -        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS user/group quota"  },
 117      -        {       DMU_BSWAP_ZAP,          TRUE,   "snapshot refcount tags"},
 118      -        {       DMU_BSWAP_ZAP,          TRUE,   "DDT ZAP algorithm"     },
 119      -        {       DMU_BSWAP_ZAP,          TRUE,   "DDT statistics"        },
 120      -        {       DMU_BSWAP_UINT8,        TRUE,   "System attributes"     },
 121      -        {       DMU_BSWAP_ZAP,          TRUE,   "SA master node"        },
 122      -        {       DMU_BSWAP_ZAP,          TRUE,   "SA attr registration"  },
 123      -        {       DMU_BSWAP_ZAP,          TRUE,   "SA attr layouts"       },
 124      -        {       DMU_BSWAP_ZAP,          TRUE,   "scan translations"     },
 125      -        {       DMU_BSWAP_UINT8,        FALSE,  "deduplicated block"    },
 126      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL deadlist map"      },
 127      -        {       DMU_BSWAP_UINT64,       TRUE,   "DSL deadlist map hdr"  },
 128      -        {       DMU_BSWAP_ZAP,          TRUE,   "DSL dir clones"        },
 129      -        {       DMU_BSWAP_UINT64,       TRUE,   "bpobj subobj"          }
       72 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "unallocated"                },
       73 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "object directory"           },
       74 +        { DMU_BSWAP_UINT64, TRUE,  TRUE,   "object array"               },
       75 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "packed nvlist"              },
       76 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "packed nvlist size"         },
       77 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj"                      },
       78 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj header"               },
       79 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map header"       },
       80 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map"              },
       81 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "ZIL intent log"             },
       82 +        { DMU_BSWAP_DNODE,  TRUE,  FALSE,  "DMU dnode"                  },
       83 +        { DMU_BSWAP_OBJSET, TRUE,  TRUE,   "DMU objset"                 },
       84 +        { DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL directory"              },
       85 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL directory child map"    },
       86 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset snap map"       },
       87 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL props"                  },
       88 +        { DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL dataset"                },
       89 +        { DMU_BSWAP_ZNODE,  TRUE,  FALSE,  "ZFS znode"                  },
       90 +        { DMU_BSWAP_OLDACL, TRUE,  FALSE,  "ZFS V0 ACL"                 },
       91 +        { DMU_BSWAP_UINT8,  FALSE, FALSE,  "ZFS plain file"             },
       92 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS directory"              },
       93 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS master node"            },
       94 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS delete queue"           },
       95 +        { DMU_BSWAP_UINT8,  FALSE, FALSE,  "zvol object"                },
       96 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "zvol prop"                  },
       97 +        { DMU_BSWAP_UINT8,  FALSE, FALSE,  "other uint8[]"              },
       98 +        { DMU_BSWAP_UINT64, FALSE, FALSE,  "other uint64[]"             },
       99 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "other ZAP"                  },
      100 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "persistent error log"       },
      101 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "SPA history"                },
      102 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA history offsets"        },
      103 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "Pool properties"            },
      104 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL permissions"            },
      105 +        { DMU_BSWAP_ACL,    TRUE,  FALSE,  "ZFS ACL"                    },
      106 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "ZFS SYSACL"                 },
      107 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "FUID table"                 },
      108 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "FUID table size"            },
      109 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset next clones"    },
      110 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan work queue"            },
      111 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group used"        },
      112 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group quota"       },
      113 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "snapshot refcount tags"     },
      114 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT ZAP algorithm"          },
      115 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT statistics"             },
      116 +        { DMU_BSWAP_UINT8,  TRUE,  FALSE,  "System attributes"          },
      117 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA master node"             },
      118 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr registration"       },
      119 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr layouts"            },
      120 +        { DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan translations"          },
      121 +        { DMU_BSWAP_UINT8,  FALSE, FALSE,  "deduplicated block"         },
      122 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL deadlist map"           },
      123 +        { DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL deadlist map hdr"       },
      124 +        { DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dir clones"             },
      125 +        { DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj subobj"               }
 130  126  };
 131  127  
 132  128  const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 133  129          {       byteswap_uint8_array,   "uint8"         },
 134  130          {       byteswap_uint16_array,  "uint16"        },
 135  131          {       byteswap_uint32_array,  "uint32"        },
 136  132          {       byteswap_uint64_array,  "uint64"        },
 137  133          {       zap_byteswap,           "zap"           },
 138  134          {       dnode_buf_byteswap,     "dnode"         },
 139  135          {       dmu_objset_byteswap,    "objset"        },
 140  136          {       zfs_znode_byteswap,     "znode"         },
 141  137          {       zfs_oldacl_byteswap,    "oldacl"        },
 142  138          {       zfs_acl_byteswap,       "acl"           }
 143  139  };
 144  140  
 145  141  int
 146  142  dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
 147  143      void *tag, dmu_buf_t **dbp)
 148  144  {
 149  145          uint64_t blkid;
 150  146          dmu_buf_impl_t *db;
 151  147  
 152  148          blkid = dbuf_whichblock(dn, 0, offset);
 153  149          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 154  150          db = dbuf_hold(dn, blkid, tag);
 155  151          rw_exit(&dn->dn_struct_rwlock);
 156  152  
 157  153          if (db == NULL) {
 158  154                  *dbp = NULL;
 159  155                  return (SET_ERROR(EIO));
 160  156          }
 161  157  
 162  158          *dbp = &db->db;
 163  159          return (0);
 164  160  }
 165  161  int
 166  162  dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
 167  163      void *tag, dmu_buf_t **dbp)
 168  164  {
 169  165          dnode_t *dn;
 170  166          uint64_t blkid;
 171  167          dmu_buf_impl_t *db;
 172  168          int err;
 173  169  
 174  170          err = dnode_hold(os, object, FTAG, &dn);
 175  171          if (err)
 176  172                  return (err);
 177  173          blkid = dbuf_whichblock(dn, 0, offset);
 178  174          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 179  175          db = dbuf_hold(dn, blkid, tag);
 180  176          rw_exit(&dn->dn_struct_rwlock);
 181  177          dnode_rele(dn, FTAG);
 182  178  
 183  179          if (db == NULL) {
 184  180                  *dbp = NULL;
 185  181                  return (SET_ERROR(EIO));
 186  182          }
 187  183  
 188  184          *dbp = &db->db;
 189  185          return (err);
 190  186  }
 191  187  
 192  188  int
 193  189  dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
 194  190      void *tag, dmu_buf_t **dbp, int flags)
 195  191  {
 196  192          int err;
 197  193          int db_flags = DB_RF_CANFAIL;
 198  194  
 199  195          if (flags & DMU_READ_NO_PREFETCH)
 200  196                  db_flags |= DB_RF_NOPREFETCH;
 201  197  
 202  198          err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
 203  199          if (err == 0) {
 204  200                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 205  201                  err = dbuf_read(db, NULL, db_flags);
 206  202                  if (err != 0) {
 207  203                          dbuf_rele(db, tag);
 208  204                          *dbp = NULL;
 209  205                  }
 210  206          }
 211  207  
 212  208          return (err);
 213  209  }
 214  210  
 215  211  int
 216  212  dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
 217  213      void *tag, dmu_buf_t **dbp, int flags)
 218  214  {
 219  215          int err;
 220  216          int db_flags = DB_RF_CANFAIL;
 221  217  
 222  218          if (flags & DMU_READ_NO_PREFETCH)
 223  219                  db_flags |= DB_RF_NOPREFETCH;
 224  220  
 225  221          err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
 226  222          if (err == 0) {
 227  223                  dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 228  224                  err = dbuf_read(db, NULL, db_flags);
 229  225                  if (err != 0) {
 230  226                          dbuf_rele(db, tag);
 231  227                          *dbp = NULL;
 232  228                  }
 233  229          }
 234  230  
 235  231          return (err);
 236  232  }
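
A minimal caller of dmu_buf_hold()/dmu_buf_rele() might look like the sketch below; my_read_u64() and its assumption that the value lies within a single block are illustrative, only the DMU calls are the interfaces defined in this file.

static int
my_read_u64(objset_t *os, uint64_t object, uint64_t offset, uint64_t *valp)
{
        dmu_buf_t *db;
        int err;

        /* hold and read the single block covering 'offset' */
        err = dmu_buf_hold(os, object, offset, FTAG, &db, DMU_READ_PREFETCH);
        if (err != 0)
                return (err);

        /* db->db_data is the block contents; db->db_offset is its start */
        *valp = *(uint64_t *)((char *)db->db_data +
            (offset - db->db_offset));

        dmu_buf_rele(db, FTAG);
        return (0);
}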
 237  233  
 238  234  int
 239  235  dmu_bonus_max(void)
 240  236  {
 241  237          return (DN_MAX_BONUSLEN);
 242  238  }
 243  239  
 244  240  int
 245  241  dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 246  242  {
 247  243          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 248  244          dnode_t *dn;
 249  245          int error;
 250  246  
 251  247          DB_DNODE_ENTER(db);
 252  248          dn = DB_DNODE(db);
 253  249  
 254  250          if (dn->dn_bonus != db) {
 255  251                  error = SET_ERROR(EINVAL);
 256  252          } else if (newsize < 0 || newsize > db_fake->db_size) {
 257  253                  error = SET_ERROR(EINVAL);
 258  254          } else {
 259  255                  dnode_setbonuslen(dn, newsize, tx);
 260  256                  error = 0;
 261  257          }
 262  258  
 263  259          DB_DNODE_EXIT(db);
 264  260          return (error);
 265  261  }
 266  262  
 267  263  int
 268  264  dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 269  265  {
 270  266          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 271  267          dnode_t *dn;
 272  268          int error;
 273  269  
 274  270          DB_DNODE_ENTER(db);
 275  271          dn = DB_DNODE(db);
 276  272  
 277  273          if (!DMU_OT_IS_VALID(type)) {
 278  274                  error = SET_ERROR(EINVAL);
 279  275          } else if (dn->dn_bonus != db) {
 280  276                  error = SET_ERROR(EINVAL);
 281  277          } else {
 282  278                  dnode_setbonus_type(dn, type, tx);
 283  279                  error = 0;
 284  280          }
 285  281  
 286  282          DB_DNODE_EXIT(db);
 287  283          return (error);
 288  284  }
 289  285  
 290  286  dmu_object_type_t
 291  287  dmu_get_bonustype(dmu_buf_t *db_fake)
 292  288  {
 293  289          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 294  290          dnode_t *dn;
 295  291          dmu_object_type_t type;
 296  292  
 297  293          DB_DNODE_ENTER(db);
 298  294          dn = DB_DNODE(db);
 299  295          type = dn->dn_bonustype;
 300  296          DB_DNODE_EXIT(db);
 301  297  
 302  298          return (type);
 303  299  }
 304  300  
 305  301  int
 306  302  dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 307  303  {
 308  304          dnode_t *dn;
 309  305          int error;
 310  306  
 311  307          error = dnode_hold(os, object, FTAG, &dn);
 312  308          dbuf_rm_spill(dn, tx);
 313  309          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 314  310          dnode_rm_spill(dn, tx);
 315  311          rw_exit(&dn->dn_struct_rwlock);
 316  312          dnode_rele(dn, FTAG);
 317  313          return (error);
 318  314  }
 319  315  
 320  316  /*
 321  317   * returns ENOENT, EIO, or 0.
 322  318   */
 323  319  int
 324  320  dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 325  321  {
 326  322          dnode_t *dn;
 327  323          dmu_buf_impl_t *db;
 328  324          int error;
 329  325  
 330  326          error = dnode_hold(os, object, FTAG, &dn);
 331  327          if (error)
 332  328                  return (error);
 333  329  
 334  330          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 335  331          if (dn->dn_bonus == NULL) {
 336  332                  rw_exit(&dn->dn_struct_rwlock);
 337  333                  rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 338  334                  if (dn->dn_bonus == NULL)
 339  335                          dbuf_create_bonus(dn);
 340  336          }
 341  337          db = dn->dn_bonus;
 342  338  
 343  339          /* as long as the bonus buf is held, the dnode will be held */
 344  340          if (refcount_add(&db->db_holds, tag) == 1) {
 345  341                  VERIFY(dnode_add_ref(dn, db));
 346  342                  atomic_inc_32(&dn->dn_dbufs_count);
 347  343          }
 348  344  
 349  345          /*
 350  346           * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
 351  347           * hold and incrementing the dbuf count to ensure that dnode_move() sees
 352  348           * a dnode hold for every dbuf.
 353  349           */
 354  350          rw_exit(&dn->dn_struct_rwlock);
 355  351  
 356  352          dnode_rele(dn, FTAG);
 357  353  
 358  354          VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
 359  355  
 360  356          *dbp = &db->db;
 361  357          return (0);
 362  358  }
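
The usual bonus-buffer pattern, sketched for a read-only caller (the my_phys_t bonus layout is hypothetical; dmu_bonus_hold() and dmu_buf_rele() are the interfaces above):

        dmu_buf_t *db;
        my_phys_t *phys;        /* hypothetical layout of the bonus area */
        int err;

        err = dmu_bonus_hold(os, object, FTAG, &db);
        if (err != 0)
                return (err);
        phys = db->db_data;     /* bonus contents, db->db_size bytes */
        /* ... read fields of *phys ... */
        dmu_buf_rele(db, FTAG);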
 363  359  
 364  360  /*
 365  361   * returns ENOENT, EIO, or 0.
 366  362   *
 367  363   * This interface will allocate a blank spill dbuf when a spill blk
 368  364   * doesn't already exist on the dnode.
 369  365   *
 370  366   * if you only want to find an already existing spill db, then
 371  367   * dmu_spill_hold_existing() should be used.
 372  368   */
 373  369  int
 374  370  dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
 375  371  {
 376  372          dmu_buf_impl_t *db = NULL;
 377  373          int err;
 378  374  
 379  375          if ((flags & DB_RF_HAVESTRUCT) == 0)
 380  376                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 381  377  
 382  378          db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
 383  379  
 384  380          if ((flags & DB_RF_HAVESTRUCT) == 0)
 385  381                  rw_exit(&dn->dn_struct_rwlock);
 386  382  
 387  383          ASSERT(db != NULL);
 388  384          err = dbuf_read(db, NULL, flags);
 389  385          if (err == 0)
 390  386                  *dbp = &db->db;
 391  387          else
 392  388                  dbuf_rele(db, tag);
 393  389          return (err);
 394  390  }
 395  391  
 396  392  int
 397  393  dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 398  394  {
 399  395          dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 400  396          dnode_t *dn;
 401  397          int err;
 402  398  
 403  399          DB_DNODE_ENTER(db);
 404  400          dn = DB_DNODE(db);
 405  401  
 406  402          if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
 407  403                  err = SET_ERROR(EINVAL);
 408  404          } else {
 409  405                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 410  406  
 411  407                  if (!dn->dn_have_spill) {
 412  408                          err = SET_ERROR(ENOENT);
 413  409                  } else {
 414  410                          err = dmu_spill_hold_by_dnode(dn,
 415  411                              DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
 416  412                  }
 417  413  
 418  414                  rw_exit(&dn->dn_struct_rwlock);
 419  415          }
 420  416  
 421  417          DB_DNODE_EXIT(db);
 422  418          return (err);
 423  419  }
 424  420  
 425  421  int
 426  422  dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 427  423  {
 428  424          dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 429  425          dnode_t *dn;
 430  426          int err;
 431  427  
 432  428          DB_DNODE_ENTER(db);
 433  429          dn = DB_DNODE(db);
 434  430          err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
 435  431          DB_DNODE_EXIT(db);
 436  432  
 437  433          return (err);
 438  434  }
 439  435  
 440  436  /*
 441  437   * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 442  438   * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 443  439   * and can induce severe lock contention when writing to several files
 444  440   * whose dnodes are in the same block.
 445  441   */
 446  442  static int
 447  443  dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 448  444      boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 449  445  {
 450  446          dmu_buf_t **dbp;
 451  447          uint64_t blkid, nblks, i;
 452  448          uint32_t dbuf_flags;
 453  449          int err;
 454  450          zio_t *zio;
 455  451  
 456  452          ASSERT(length <= DMU_MAX_ACCESS);
 457  453  
 458  454          /*
 459  455           * Note: We directly notify the prefetch code of this read, so that
 460  456           * we can tell it about the multi-block read.  dbuf_read() only knows
 461  457           * about the one block it is accessing.
 462  458           */
 463  459          dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
 464  460              DB_RF_NOPREFETCH;
 465  461  
 466  462          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 467  463          if (dn->dn_datablkshift) {
 468  464                  int blkshift = dn->dn_datablkshift;
 469  465                  nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
 470  466                      P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
 471  467          } else {
 472  468                  if (offset + length > dn->dn_datablksz) {
 473  469                          zfs_panic_recover("zfs: accessing past end of object "
 474  470                              "%llx/%llx (size=%u access=%llu+%llu)",
 475  471                              (longlong_t)dn->dn_objset->
 476  472                              os_dsl_dataset->ds_object,
 477  473                              (longlong_t)dn->dn_object, dn->dn_datablksz,
 478  474                              (longlong_t)offset, (longlong_t)length);
 479  475                          rw_exit(&dn->dn_struct_rwlock);
 480  476                          return (SET_ERROR(EIO));
 481  477                  }
 482  478                  nblks = 1;
 483  479          }
 484  480          dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 485  481  
 486  482          zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 487  483          blkid = dbuf_whichblock(dn, 0, offset);
 488  484          for (i = 0; i < nblks; i++) {
 489  485                  dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 490  486                  if (db == NULL) {
 491  487                          rw_exit(&dn->dn_struct_rwlock);
 492  488                          dmu_buf_rele_array(dbp, nblks, tag);
 493  489                          zio_nowait(zio);
 494  490                          return (SET_ERROR(EIO));
 495  491                  }
 496  492  
 497  493                  /* initiate async i/o */
 498  494                  if (read)
 499  495                          (void) dbuf_read(db, zio, dbuf_flags);
 500  496                  dbp[i] = &db->db;
 501  497          }
 502  498  
 503  499          if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
 504  500              DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
 505  501                  dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
 506  502                      read && DNODE_IS_CACHEABLE(dn));
 507  503          }
 508  504          rw_exit(&dn->dn_struct_rwlock);
 509  505  
 510  506          /* wait for async i/o */
 511  507          err = zio_wait(zio);
 512  508          if (err) {
 513  509                  dmu_buf_rele_array(dbp, nblks, tag);
 514  510                  return (err);
 515  511          }
 516  512  
 517  513          /* wait for other io to complete */
 518  514          if (read) {
 519  515                  for (i = 0; i < nblks; i++) {
 520  516                          dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 521  517                          mutex_enter(&db->db_mtx);
 522  518                          while (db->db_state == DB_READ ||
 523  519                              db->db_state == DB_FILL)
 524  520                                  cv_wait(&db->db_changed, &db->db_mtx);
 525  521                          if (db->db_state == DB_UNCACHED)
 526  522                                  err = SET_ERROR(EIO);
 527  523                          mutex_exit(&db->db_mtx);
 528  524                          if (err) {
 529  525                                  dmu_buf_rele_array(dbp, nblks, tag);
 530  526                                  return (err);
 531  527                          }
 532  528                  }
 533  529          }
 534  530  
 535  531          *numbufsp = nblks;
 536  532          *dbpp = dbp;
 537  533          return (0);
 538  534  }
 539  535  
 540  536  static int
 541  537  dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
 542  538      uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 543  539  {
 544  540          dnode_t *dn;
 545  541          int err;
 546  542  
 547  543          err = dnode_hold(os, object, FTAG, &dn);
 548  544          if (err)
 549  545                  return (err);
 550  546  
 551  547          err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 552  548              numbufsp, dbpp, DMU_READ_PREFETCH);
 553  549  
 554  550          dnode_rele(dn, FTAG);
 555  551  
 556  552          return (err);
 557  553  }
 558  554  
 559  555  int
 560  556  dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
 561  557      uint64_t length, boolean_t read, void *tag, int *numbufsp,
 562  558      dmu_buf_t ***dbpp)
 563  559  {
 564  560          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 565  561          dnode_t *dn;
 566  562          int err;
 567  563  
 568  564          DB_DNODE_ENTER(db);
 569  565          dn = DB_DNODE(db);
 570  566          err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 571  567              numbufsp, dbpp, DMU_READ_PREFETCH);
 572  568          DB_DNODE_EXIT(db);
 573  569  
 574  570          return (err);
 575  571  }
 576  572  
 577  573  void
 578  574  dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 579  575  {
 580  576          int i;
 581  577          dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
 582  578  
 583  579          if (numbufs == 0)
 584  580                  return;
 585  581  
 586  582          for (i = 0; i < numbufs; i++) {
 587  583                  if (dbp[i])
 588  584                          dbuf_rele(dbp[i], tag);
 589  585          }
 590  586  
 591  587          kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 592  588  }
 593  589  
 594  590  /*
 595  591   * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
  596  592   * indirect blocks prefetched will be those that point to the blocks containing
 597  593   * the data starting at offset, and continuing to offset + len.
 598  594   *
 599  595   * Note that if the indirect blocks above the blocks being prefetched are not in
  600  596   * cache, they will be asynchronously read in.
 601  597   */
 602  598  void
 603  599  dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
 604  600      uint64_t len, zio_priority_t pri)
 605  601  {
 606  602          dnode_t *dn;
 607  603          uint64_t blkid;
 608  604          int nblks, err;
 609  605  
 610  606          if (len == 0) {  /* they're interested in the bonus buffer */
 611  607                  dn = DMU_META_DNODE(os);
 612  608  
 613  609                  if (object == 0 || object >= DN_MAX_OBJECT)
 614  610                          return;
 615  611  
 616  612                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 617  613                  blkid = dbuf_whichblock(dn, level,
 618  614                      object * sizeof (dnode_phys_t));
 619  615                  dbuf_prefetch(dn, level, blkid, pri, 0);
 620  616                  rw_exit(&dn->dn_struct_rwlock);
 621  617                  return;
 622  618          }
 623  619  
 624  620          /*
 625  621           * XXX - Note, if the dnode for the requested object is not
 626  622           * already cached, we will do a *synchronous* read in the
 627  623           * dnode_hold() call.  The same is true for any indirects.
 628  624           */
 629  625          err = dnode_hold(os, object, FTAG, &dn);
 630  626          if (err != 0)
 631  627                  return;
 632  628  
 633  629          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 634  630          /*
 635  631           * offset + len - 1 is the last byte we want to prefetch for, and offset
  636  632   * is the first.  Then dbuf_whichblock(dn, level, off + len - 1) is the
 637  633           * last block we want to prefetch, and dbuf_whichblock(dn, level,
 638  634           * offset)  is the first.  Then the number we need to prefetch is the
 639  635           * last - first + 1.
 640  636           */
 641  637          if (level > 0 || dn->dn_datablkshift != 0) {
 642  638                  nblks = dbuf_whichblock(dn, level, offset + len - 1) -
 643  639                      dbuf_whichblock(dn, level, offset) + 1;
 644  640          } else {
 645  641                  nblks = (offset < dn->dn_datablksz);
 646  642          }
 647  643  
 648  644          if (nblks != 0) {
 649  645                  blkid = dbuf_whichblock(dn, level, offset);
 650  646                  for (int i = 0; i < nblks; i++)
 651  647                          dbuf_prefetch(dn, level, blkid + i, pri, 0);
 652  648          }
 653  649  
 654  650          rw_exit(&dn->dn_struct_rwlock);
 655  651  
 656  652          dnode_rele(dn, FTAG);
 657  653  }
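
To make the block-count arithmetic above concrete (assumed values, for illustration only): at level 0 with a 128K block size (dn_datablkshift == 17), dbuf_whichblock() reduces to offset >> 17, so prefetching offset = 100K, len = 300K spans blocks (100K >> 17) = 0 through ((400K - 1) >> 17) = 3, giving nblks = 3 - 0 + 1 = 4 dbuf_prefetch() calls.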
 658  654  
 659  655  /*
 660  656   * Get the next "chunk" of file data to free.  We traverse the file from
  661  657   * the end so that the file gets shorter over time (if we crash in the
 662  658   * middle, this will leave us in a better state).  We find allocated file
 663  659   * data by simply searching the allocated level 1 indirects.
 664  660   *
 665  661   * On input, *start should be the first offset that does not need to be
 666  662   * freed (e.g. "offset + length").  On return, *start will be the first
 667  663   * offset that should be freed.
 668  664   */
 669  665  static int
 670  666  get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
 671  667  {
 672  668          uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
 673  669          /* bytes of data covered by a level-1 indirect block */
 674  670          uint64_t iblkrange =
 675  671              dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 676  672  
 677  673          ASSERT3U(minimum, <=, *start);
 678  674  
 679  675          if (*start - minimum <= iblkrange * maxblks) {
 680  676                  *start = minimum;
 681  677                  return (0);
 682  678          }
 683  679          ASSERT(ISP2(iblkrange));
 684  680  
 685  681          for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
 686  682                  int err;
 687  683  
 688  684                  /*
 689  685                   * dnode_next_offset(BACKWARDS) will find an allocated L1
 690  686                   * indirect block at or before the input offset.  We must
 691  687                   * decrement *start so that it is at the end of the region
 692  688                   * to search.
 693  689                   */
 694  690                  (*start)--;
 695  691                  err = dnode_next_offset(dn,
 696  692                      DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 697  693  
 698  694                  /* if there are no indirect blocks before start, we are done */
 699  695                  if (err == ESRCH) {
 700  696                          *start = minimum;
 701  697                          break;
 702  698                  } else if (err != 0) {
 703  699                          return (err);
 704  700                  }
  
 705  701  
 706  702                  /* set start to the beginning of this L1 indirect */
 707  703                  *start = P2ALIGN(*start, iblkrange);
 708  704          }
 709  705          if (*start < minimum)
 710  706                  *start = minimum;
 711  707          return (0);
 712  708  }
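
For scale (assuming the common 128K data block and 128K indirect block sizes, i.e. dn_indblkshift == 17 and SPA_BLKPTRSHIFT == 7): EPB(17, 7) is 1024 block pointers per L1 indirect, so iblkrange = 128K * 1024 = 128M, and each iteration of the loop above steps *start back to the beginning of one such 128M-aligned region.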
 713  709  
 714  710  /*
 715      - * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
 716      - * otherwise return false.
 717      - * Used below in dmu_free_long_range_impl() to enable abort when unmounting
       711 + * If this dnode is in the ZFS object set,
       712 + * return true if the vfs's unmounted flag is set or the
       713 + * zfsvfs is currently suspended; otherwise return false.
 718  714   */
 719  715  /*ARGSUSED*/
 720  716  static boolean_t
 721      -dmu_objset_zfs_unmounting(objset_t *os)
      717 +dmu_dnode_fs_unmounting_or_suspended(dnode_t *freeing_dn)
 722  718  {
 723  719  #ifdef _KERNEL
 724      -        if (dmu_objset_type(os) == DMU_OST_ZFS)
 725      -                return (zfs_get_vfs_flag_unmounted(os));
 726      -#endif
      720 +        boolean_t busy = B_FALSE;
      721 +        objset_t *os = freeing_dn->dn_objset;
      722 +        zfsvfs_t *zfsvfs;
      723 +
      724 +        if (dmu_objset_type(os) == DMU_OST_ZFS) {
      725 +                mutex_enter(&os->os_user_ptr_lock);
      726 +                zfsvfs = dmu_objset_get_user(os);
      727 +                if (zfsvfs != NULL && zfsvfs->z_vfs != NULL &&
      728 +                    ((zfsvfs->z_vfs->vfs_flag & VFS_UNMOUNTED) ||
      729 +                     zfsvfs->z_busy))
      730 +                        busy = B_TRUE;
      731 +                mutex_exit(&os->os_user_ptr_lock);
      732 +        }
      733 +
      734 +        return (busy);
      735 +#else
 727  736          return (B_FALSE);
      737 +#endif
 728  738  }
 729  739  
 730  740  static int
 731  741  dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 732  742      uint64_t length)
 733  743  {
 734  744          uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 735  745          int err;
 736  746          uint64_t dirty_frees_threshold;
 737  747          dsl_pool_t *dp = dmu_objset_pool(os);
 738  748  
 739  749          if (offset >= object_size)
 740  750                  return (0);
 741  751  
 742  752          if (zfs_per_txg_dirty_frees_percent <= 100)
 743  753                  dirty_frees_threshold =
 744  754                      zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 745  755          else
 746  756                  dirty_frees_threshold = zfs_dirty_data_max / 4;
 747  757  
      758 +        if (length == DMU_OBJECT_END && offset == 0)
      759 +                dnode_evict_dbufs(dn, 0);
      760 +
 748  761          if (length == DMU_OBJECT_END || offset + length > object_size)
 749  762                  length = object_size - offset;
 750  763  
      764 +        mutex_enter(&dp->dp_lock);
      765 +        dp->dp_long_freeing_total += length;
      766 +        mutex_exit(&dp->dp_lock);
      767 +
 751  768          while (length != 0) {
 752  769                  uint64_t chunk_end, chunk_begin, chunk_len;
 753  770                  uint64_t long_free_dirty_all_txgs = 0;
 754  771                  dmu_tx_t *tx;
 755  772  
 756      -                if (dmu_objset_zfs_unmounting(dn->dn_objset))
      773 +                if (dmu_dnode_fs_unmounting_or_suspended(dn)) {
      774 +                        mutex_enter(&dp->dp_lock);
      775 +                        dp->dp_long_freeing_total -= length;
      776 +                        mutex_exit(&dp->dp_lock);
      777 +
 757  778                          return (SET_ERROR(EINTR));
      779 +                }
 758  780  
 759  781                  chunk_end = chunk_begin = offset + length;
 760  782  
 761  783                  /* move chunk_begin backwards to the beginning of this chunk */
 762  784                  err = get_next_chunk(dn, &chunk_begin, offset);
 763  785                  if (err)
 764  786                          return (err);
 765  787                  ASSERT3U(chunk_begin, >=, offset);
 766  788                  ASSERT3U(chunk_begin, <=, chunk_end);
 767  789  
 768  790                  chunk_len = chunk_end - chunk_begin;
 769  791  
 770  792                  mutex_enter(&dp->dp_lock);
 771  793                  for (int t = 0; t < TXG_SIZE; t++) {
 772  794                          long_free_dirty_all_txgs +=
 773  795                              dp->dp_long_free_dirty_pertxg[t];
 774  796                  }
 775  797                  mutex_exit(&dp->dp_lock);
 776  798  
 777  799                  /*
  778  800                   * To avoid filling up a TXG with just frees, wait for
 779  801                   * the next TXG to open before freeing more chunks if
 780  802                   * we have reached the threshold of frees
 781  803                   */
 782  804                  if (dirty_frees_threshold != 0 &&
 783  805                      long_free_dirty_all_txgs >= dirty_frees_threshold) {
 784  806                          txg_wait_open(dp, 0);
 785  807                          continue;
 786  808                  }
 787  809  
 788  810                  tx = dmu_tx_create(os);
  
 789  811                  dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 790  812  
 791  813                  /*
 792  814                   * Mark this transaction as typically resulting in a net
 793  815                   * reduction in space used.
 794  816                   */
 795  817                  dmu_tx_mark_netfree(tx);
 796  818                  err = dmu_tx_assign(tx, TXG_WAIT);
 797  819                  if (err) {
 798  820                          dmu_tx_abort(tx);
      821 +                        mutex_enter(&dp->dp_lock);
      822 +                        dp->dp_long_freeing_total -= length - chunk_len;
      823 +                        mutex_exit(&dp->dp_lock);
 799  824                          return (err);
 800  825                  }
 801  826  
 802  827                  mutex_enter(&dp->dp_lock);
 803  828                  dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
 804  829                      chunk_len;
 805  830                  mutex_exit(&dp->dp_lock);
 806  831                  DTRACE_PROBE3(free__long__range,
 807  832                      uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
 808  833                      uint64_t, dmu_tx_get_txg(tx));
 809  834                  dnode_free_range(dn, chunk_begin, chunk_len, tx);
 810  835                  dmu_tx_commit(tx);
 811  836  
 812  837                  length -= chunk_len;
 813  838          }
 814  839          return (0);
 815  840  }
 816  841  
 817  842  int
 818  843  dmu_free_long_range(objset_t *os, uint64_t object,
 819  844      uint64_t offset, uint64_t length)
 820  845  {
 821  846          dnode_t *dn;
 822  847          int err;
 823  848  
 824  849          err = dnode_hold(os, object, FTAG, &dn);
 825  850          if (err != 0)
 826  851                  return (err);
 827  852          err = dmu_free_long_range_impl(os, dn, offset, length);
 828  853  
 829  854          /*
 830  855           * It is important to zero out the maxblkid when freeing the entire
 831  856           * file, so that (a) subsequent calls to dmu_free_long_range_impl()
 832  857           * will take the fast path, and (b) dnode_reallocate() can verify
 833  858           * that the entire file has been freed.
 834  859           */
 835  860          if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
 836  861                  dn->dn_maxblkid = 0;
 837  862  
 838  863          dnode_rele(dn, FTAG);
 839  864          return (err);
 840  865  }
 841  866  
 842  867  int
 843  868  dmu_free_long_object(objset_t *os, uint64_t object)
 844  869  {
 845  870          dmu_tx_t *tx;
 846  871          int err;
 847  872  
 848  873          err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
 849  874          if (err != 0)
 850  875                  return (err);
 851  876  
 852  877          tx = dmu_tx_create(os);
 853  878          dmu_tx_hold_bonus(tx, object);
 854  879          dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 855  880          dmu_tx_mark_netfree(tx);
 856  881          err = dmu_tx_assign(tx, TXG_WAIT);
 857  882          if (err == 0) {
 858  883                  err = dmu_object_free(os, object, tx);
 859  884                  dmu_tx_commit(tx);
 860  885          } else {
 861  886                  dmu_tx_abort(tx);
 862  887          }
 863  888  
 864  889          return (err);
 865  890  }
 866  891  
 867  892  int
 868  893  dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 869  894      uint64_t size, dmu_tx_t *tx)
 870  895  {
 871  896          dnode_t *dn;
 872  897          int err = dnode_hold(os, object, FTAG, &dn);
 873  898          if (err)
 874  899                  return (err);
 875  900          ASSERT(offset < UINT64_MAX);
 876  901          ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
 877  902          dnode_free_range(dn, offset, size, tx);
 878  903          dnode_rele(dn, FTAG);
 879  904          return (0);
 880  905  }
 881  906  
 882  907  static int
 883  908  dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
 884  909      void *buf, uint32_t flags)
 885  910  {
 886  911          dmu_buf_t **dbp;
 887  912          int numbufs, err = 0;
 888  913  
 889  914          /*
 890  915           * Deal with odd block sizes, where there can't be data past the first
 891  916           * block.  If we ever do the tail block optimization, we will need to
 892  917           * handle that here as well.
 893  918           */
 894  919          if (dn->dn_maxblkid == 0) {
 895  920                  int newsz = offset > dn->dn_datablksz ? 0 :
 896  921                      MIN(size, dn->dn_datablksz - offset);
 897  922                  bzero((char *)buf + newsz, size - newsz);
 898  923                  size = newsz;
 899  924          }
 900  925  
 901  926          while (size > 0) {
 902  927                  uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 903  928                  int i;
 904  929  
 905  930                  /*
 906  931                   * NB: we could do this block-at-a-time, but it's nice
 907  932                   * to be reading in parallel.
 908  933                   */
 909  934                  err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 910  935                      TRUE, FTAG, &numbufs, &dbp, flags);
 911  936                  if (err)
 912  937                          break;
 913  938  
 914  939                  for (i = 0; i < numbufs; i++) {
 915  940                          int tocpy;
 916  941                          int bufoff;
 917  942                          dmu_buf_t *db = dbp[i];
 918  943  
 919  944                          ASSERT(size > 0);
 920  945  
 921  946                          bufoff = offset - db->db_offset;
 922  947                          tocpy = (int)MIN(db->db_size - bufoff, size);
 923  948  
 924  949                          bcopy((char *)db->db_data + bufoff, buf, tocpy);
 925  950  
 926  951                          offset += tocpy;
 927  952                          size -= tocpy;
 928  953                          buf = (char *)buf + tocpy;
 929  954                  }
 930  955                  dmu_buf_rele_array(dbp, numbufs, FTAG);
 931  956          }
 932  957          return (err);
 933  958  }
 934  959  
 935  960  int
 936  961  dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 937  962      void *buf, uint32_t flags)
 938  963  {
 939  964          dnode_t *dn;
 940  965          int err;
 941  966  
 942  967          err = dnode_hold(os, object, FTAG, &dn);
 943  968          if (err != 0)
 944  969                  return (err);
 945  970  
 946  971          err = dmu_read_impl(dn, offset, size, buf, flags);
 947  972          dnode_rele(dn, FTAG);
 948  973          return (err);
 949  974  }
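
dmu_read() needs no transaction; it holds, reads, and releases the covering buffers internally, splitting requests larger than DMU_MAX_ACCESS / 2. A trivial caller sketch (names illustrative):

        uint64_t val;
        int err;

        /* read 8 bytes from the start of 'object' into a stack variable */
        err = dmu_read(os, object, 0, sizeof (val), &val, DMU_READ_PREFETCH);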
 950  975  
 951  976  int
 952  977  dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
 953  978      uint32_t flags)
 954  979  {
 955  980          return (dmu_read_impl(dn, offset, size, buf, flags));
 956  981  }
 957  982  
 958  983  static void
 959  984  dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
 960  985      const void *buf, dmu_tx_t *tx)
 961  986  {
 962  987          int i;
 963  988  
 964  989          for (i = 0; i < numbufs; i++) {
 965  990                  int tocpy;
 966  991                  int bufoff;
 967  992                  dmu_buf_t *db = dbp[i];
 968  993  
 969  994                  ASSERT(size > 0);
 970  995  
 971  996                  bufoff = offset - db->db_offset;
 972  997                  tocpy = (int)MIN(db->db_size - bufoff, size);
 973  998  
 974  999                  ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 975 1000  
 976 1001                  if (tocpy == db->db_size)
 977 1002                          dmu_buf_will_fill(db, tx);
 978 1003                  else
 979 1004                          dmu_buf_will_dirty(db, tx);
 980 1005  
 981 1006                  bcopy(buf, (char *)db->db_data + bufoff, tocpy);
 982 1007  
 983 1008                  if (tocpy == db->db_size)
 984 1009                          dmu_buf_fill_done(db, tx);
 985 1010  
 986 1011                  offset += tocpy;
 987 1012                  size -= tocpy;
 988 1013                  buf = (char *)buf + tocpy;
 989 1014          }
 990 1015  }
 991 1016  
 992 1017  void
 993 1018  dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 994 1019      const void *buf, dmu_tx_t *tx)
 995 1020  {
 996 1021          dmu_buf_t **dbp;
 997 1022          int numbufs;
 998 1023  
 999 1024          if (size == 0)
1000 1025                  return;
1001 1026  
1002 1027          VERIFY0(dmu_buf_hold_array(os, object, offset, size,
1003 1028              FALSE, FTAG, &numbufs, &dbp));
1004 1029          dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
1005 1030          dmu_buf_rele_array(dbp, numbufs, FTAG);
1006 1031  }
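
dmu_write() expects the caller to supply an assigned transaction covering the range; a minimal caller therefore pairs it with the dmu_tx interfaces, roughly as in this sketch (my_write() itself is illustrative):

static int
my_write(objset_t *os, uint64_t object, uint64_t offset, int size,
    const void *buf)
{
        dmu_tx_t *tx = dmu_tx_create(os);
        int err;

        dmu_tx_hold_write(tx, object, offset, size);
        err = dmu_tx_assign(tx, TXG_WAIT);
        if (err != 0) {
                dmu_tx_abort(tx);
                return (err);
        }
        dmu_write(os, object, offset, size, buf, tx);
        dmu_tx_commit(tx);
        return (0);
}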
1007 1032  
1008 1033  void
1009 1034  dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
1010 1035      const void *buf, dmu_tx_t *tx)
1011 1036  {
1012 1037          dmu_buf_t **dbp;
1013 1038          int numbufs;
  
1014 1039  
1015 1040          if (size == 0)
1016 1041                  return;
1017 1042  
1018 1043          VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
1019 1044              FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
1020 1045          dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
1021 1046          dmu_buf_rele_array(dbp, numbufs, FTAG);
1022 1047  }
1023 1048  
1024      -static int
1025      -dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
1026      -    uint64_t last_removal_txg, uint64_t offset)
1027      -{
1028      -        uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
1029      -        int err = 0;
1030      -
1031      -        rw_enter(&dn->dn_struct_rwlock, RW_READER);
1032      -        dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
1033      -        ASSERT3P(dbuf, !=, NULL);
1034      -
1035      -        /*
1036      -         * If the block hasn't been written yet, this default will ensure
1037      -         * we don't try to remap it.
1038      -         */
1039      -        uint64_t birth = UINT64_MAX;
1040      -        ASSERT3U(last_removal_txg, !=, UINT64_MAX);
1041      -        if (dbuf->db_blkptr != NULL)
1042      -                birth = dbuf->db_blkptr->blk_birth;
1043      -        rw_exit(&dn->dn_struct_rwlock);
1044      -
1045      -        /*
1046      -         * If this L1 was already written after the last removal, then we've
1047      -         * already tried to remap it.
1048      -         */
1049      -        if (birth <= last_removal_txg &&
1050      -            dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
1051      -            dbuf_can_remap(dbuf)) {
1052      -                dmu_tx_t *tx = dmu_tx_create(os);
1053      -                dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
1054      -                err = dmu_tx_assign(tx, TXG_WAIT);
1055      -                if (err == 0) {
1056      -                        (void) dbuf_dirty(dbuf, tx);
1057      -                        dmu_tx_commit(tx);
1058      -                } else {
1059      -                        dmu_tx_abort(tx);
1060      -                }
1061      -        }
1062      -
1063      -        dbuf_rele(dbuf, FTAG);
1064      -
1065      -        delay(zfs_object_remap_one_indirect_delay_ticks);
1066      -
1067      -        return (err);
1068      -}
1069      -
1070      -/*
1071      - * Remap all blockpointers in the object, if possible, so that they reference
1072      - * only concrete vdevs.
1073      - *
1074      - * To do this, iterate over the L0 blockpointers and remap any that reference
1075      - * an indirect vdev. Note that we only examine L0 blockpointers; since we
1076      - * cannot guarantee that we can remap all blockpointer anyways (due to split
1077      - * blocks), we do not want to make the code unnecessarily complicated to
1078      - * catch the unlikely case that there is an L1 block on an indirect vdev that
1079      - * contains no indirect blockpointers.
1080      - */
1081      -int
1082      -dmu_object_remap_indirects(objset_t *os, uint64_t object,
1083      -    uint64_t last_removal_txg)
1084      -{
1085      -        uint64_t offset, l1span;
1086      -        int err;
1087      -        dnode_t *dn;
1088      -
1089      -        err = dnode_hold(os, object, FTAG, &dn);
1090      -        if (err != 0) {
1091      -                return (err);
1092      -        }
1093      -
1094      -        if (dn->dn_nlevels <= 1) {
1095      -                if (issig(JUSTLOOKING) && issig(FORREAL)) {
1096      -                        err = SET_ERROR(EINTR);
1097      -                }
1098      -
1099      -                /*
1100      -                 * If the dnode has no indirect blocks, we cannot dirty them.
1101      -                 * We still want to remap the blkptr(s) in the dnode if
1102      -                 * appropriate, so mark it as dirty.
1103      -                 */
1104      -                if (err == 0 && dnode_needs_remap(dn)) {
1105      -                        dmu_tx_t *tx = dmu_tx_create(os);
1106      -                        dmu_tx_hold_bonus(tx, dn->dn_object);
1107      -                        if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
1108      -                                dnode_setdirty(dn, tx);
1109      -                                dmu_tx_commit(tx);
1110      -                        } else {
1111      -                                dmu_tx_abort(tx);
1112      -                        }
1113      -                }
1114      -
1115      -                dnode_rele(dn, FTAG);
1116      -                return (err);
1117      -        }
1118      -
1119      -        offset = 0;
1120      -        l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
1121      -            dn->dn_datablkshift);
1122      -        /*
1123      -         * Find the next L1 indirect that is not a hole.
1124      -         */
1125      -        while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
1126      -                if (issig(JUSTLOOKING) && issig(FORREAL)) {
1127      -                        err = SET_ERROR(EINTR);
1128      -                        break;
1129      -                }
1130      -                if ((err = dmu_object_remap_one_indirect(os, dn,
1131      -                    last_removal_txg, offset)) != 0) {
1132      -                        break;
1133      -                }
1134      -                offset += l1span;
1135      -        }
1136      -
1137      -        dnode_rele(dn, FTAG);
1138      -        return (err);
1139      -}
1140      -
1141 1049  void
1142 1050  dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1143 1051      dmu_tx_t *tx)
1144 1052  {
1145 1053          dmu_buf_t **dbp;
1146 1054          int numbufs, i;
1147 1055  
1148 1056          if (size == 0)
1149 1057                  return;
1150 1058  
1151 1059          VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
1152 1060              FALSE, FTAG, &numbufs, &dbp));
1153 1061  
1154 1062          for (i = 0; i < numbufs; i++) {
1155 1063                  dmu_buf_t *db = dbp[i];
1156 1064  
1157 1065                  dmu_buf_will_not_fill(db, tx);
1158 1066          }
1159 1067          dmu_buf_rele_array(dbp, numbufs, FTAG);
1160 1068  }
1161 1069  
1162 1070  void
1163 1071  dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
1164 1072      void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
1165 1073      int compressed_size, int byteorder, dmu_tx_t *tx)
1166 1074  {
1167 1075          dmu_buf_t *db;
1168 1076  
1169 1077          ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
1170 1078          ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
1171 1079          VERIFY0(dmu_buf_hold_noread(os, object, offset,
1172 1080              FTAG, &db));
1173 1081  
1174 1082          dmu_buf_write_embedded(db,
1175 1083              data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
1176 1084              uncompressed_size, compressed_size, byteorder, tx);
1177 1085  
1178 1086          dmu_buf_rele(db, FTAG);
1179 1087  }
1180 1088  
1181 1089  /*
1182 1090   * DMU support for xuio
1183 1091   */
1184 1092  kstat_t *xuio_ksp = NULL;
1185 1093  
1186 1094  int
1187 1095  dmu_xuio_init(xuio_t *xuio, int nblk)
1188 1096  {
1189 1097          dmu_xuio_t *priv;
1190 1098          uio_t *uio = &xuio->xu_uio;
1191 1099  
1192 1100          uio->uio_iovcnt = nblk;
1193 1101          uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
1194 1102  
1195 1103          priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
1196 1104          priv->cnt = nblk;
1197 1105          priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
1198 1106          priv->iovp = uio->uio_iov;
1199 1107          XUIO_XUZC_PRIV(xuio) = priv;
1200 1108  
1201 1109          if (XUIO_XUZC_RW(xuio) == UIO_READ)
1202 1110                  XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
1203 1111          else
1204 1112                  XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
1205 1113  
1206 1114          return (0);
1207 1115  }
1208 1116  
1209 1117  void
1210 1118  dmu_xuio_fini(xuio_t *xuio)
1211 1119  {
1212 1120          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1213 1121          int nblk = priv->cnt;
1214 1122  
1215 1123          kmem_free(priv->iovp, nblk * sizeof (iovec_t));
1216 1124          kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
1217 1125          kmem_free(priv, sizeof (dmu_xuio_t));
1218 1126  
1219 1127          if (XUIO_XUZC_RW(xuio) == UIO_READ)
1220 1128                  XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
1221 1129          else
1222 1130                  XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
1223 1131  }
1224 1132  
1225 1133  /*
1226 1134   * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
1227 1135   * and increase priv->next by 1.
1228 1136   */
1229 1137  int
1230 1138  dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
1231 1139  {
1232 1140          struct iovec *iov;
1233 1141          uio_t *uio = &xuio->xu_uio;
1234 1142          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1235 1143          int i = priv->next++;
1236 1144  
1237 1145          ASSERT(i < priv->cnt);
1238 1146          ASSERT(off + n <= arc_buf_lsize(abuf));
1239 1147          iov = uio->uio_iov + i;
1240 1148          iov->iov_base = (char *)abuf->b_data + off;
1241 1149          iov->iov_len = n;
1242 1150          priv->bufs[i] = abuf;
1243 1151          return (0);
1244 1152  }
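
/*
 * Illustrative sketch (hypothetical caller, for exposition): a zero-copy
 * consumer loans one arc buffer per block, hands each to the xuio with
 * dmu_xuio_add(), and unwinds with dmu_xuio_fini() on failure.  The function
 * name and the way 'bufs' was obtained are assumptions, not taken from this
 * file.
 */
#if 0
static int
example_xuio_attach_bufs(xuio_t *xuio, arc_buf_t **bufs, int nblk)
{
        int i, err;

        if ((err = dmu_xuio_init(xuio, nblk)) != 0)
                return (err);

        /* one iovec per loaned buffer, starting at offset 0 of each */
        for (i = 0; i < nblk; i++) {
                err = dmu_xuio_add(xuio, bufs[i], 0,
                    arc_buf_lsize(bufs[i]));
                if (err != 0) {
                        dmu_xuio_fini(xuio);
                        return (err);
                }
        }
        return (0);
}
#endif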
1245 1153  
1246 1154  int
1247 1155  dmu_xuio_cnt(xuio_t *xuio)
1248 1156  {
1249 1157          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1250 1158          return (priv->cnt);
1251 1159  }
1252 1160  
1253 1161  arc_buf_t *
1254 1162  dmu_xuio_arcbuf(xuio_t *xuio, int i)
1255 1163  {
1256 1164          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1257 1165  
1258 1166          ASSERT(i < priv->cnt);
1259 1167          return (priv->bufs[i]);
1260 1168  }
1261 1169  
1262 1170  void
1263 1171  dmu_xuio_clear(xuio_t *xuio, int i)
1264 1172  {
1265 1173          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
1266 1174  
1267 1175          ASSERT(i < priv->cnt);
1268 1176          priv->bufs[i] = NULL;
1269 1177  }
1270 1178  
1271 1179  static void
1272 1180  xuio_stat_init(void)
1273 1181  {
1274 1182          xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
1275 1183              KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
1276 1184              KSTAT_FLAG_VIRTUAL);
1277 1185          if (xuio_ksp != NULL) {
1278 1186                  xuio_ksp->ks_data = &xuio_stats;
1279 1187                  kstat_install(xuio_ksp);
1280 1188          }
1281 1189  }
1282 1190  
1283 1191  static void
1284 1192  xuio_stat_fini(void)
1285 1193  {
1286 1194          if (xuio_ksp != NULL) {
1287 1195                  kstat_delete(xuio_ksp);
1288 1196                  xuio_ksp = NULL;
1289 1197          }
1290 1198  }
1291 1199  
1292 1200  void
1293 1201  xuio_stat_wbuf_copied(void)
1294 1202  {
1295 1203          XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1296 1204  }
1297 1205  
1298 1206  void
1299 1207  xuio_stat_wbuf_nocopy(void)
1300 1208  {
1301 1209          XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
1302 1210  }
1303 1211  
1304 1212  #ifdef _KERNEL
1305 1213  static int
1306 1214  dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
1307 1215  {
1308 1216          dmu_buf_t **dbp;
1309 1217          int numbufs, i, err;
1310 1218          xuio_t *xuio = NULL;
1311 1219  
1312 1220          /*
1313 1221           * NB: we could do this block-at-a-time, but it's nice
1314 1222           * to be reading in parallel.
1315 1223           */
1316 1224          err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1317 1225              TRUE, FTAG, &numbufs, &dbp, 0);
1318 1226          if (err)
1319 1227                  return (err);
1320 1228  
1321 1229          if (uio->uio_extflg == UIO_XUIO)
1322 1230                  xuio = (xuio_t *)uio;
1323 1231  
1324 1232          for (i = 0; i < numbufs; i++) {
1325 1233                  int tocpy;
1326 1234                  int bufoff;
1327 1235                  dmu_buf_t *db = dbp[i];
1328 1236  
1329 1237                  ASSERT(size > 0);
1330 1238  
1331 1239                  bufoff = uio->uio_loffset - db->db_offset;
1332 1240                  tocpy = (int)MIN(db->db_size - bufoff, size);
1333 1241  
1334 1242                  if (xuio) {
1335 1243                          dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
1336 1244                          arc_buf_t *dbuf_abuf = dbi->db_buf;
1337 1245                          arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
1338 1246                          err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
1339 1247                          if (!err) {
1340 1248                                  uio->uio_resid -= tocpy;
1341 1249                                  uio->uio_loffset += tocpy;
1342 1250                          }
1343 1251  
1344 1252                          if (abuf == dbuf_abuf)
1345 1253                                  XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
1346 1254                          else
1347 1255                                  XUIOSTAT_BUMP(xuiostat_rbuf_copied);
1348 1256                  } else {
1349 1257                          err = uiomove((char *)db->db_data + bufoff, tocpy,
1350 1258                              UIO_READ, uio);
1351 1259                  }
1352 1260                  if (err)
1353 1261                          break;
1354 1262  
1355 1263                  size -= tocpy;
1356 1264          }
1357 1265          dmu_buf_rele_array(dbp, numbufs, FTAG);
1358 1266  
1359 1267          return (err);
1360 1268  }
1361 1269  
1362 1270  /*
1363 1271   * Read 'size' bytes into the uio buffer.
1364 1272   * From object zdb->db_object.
1365 1273   * Starting at offset uio->uio_loffset.
1366 1274   *
1367 1275   * If the caller already has a dbuf in the target object
1368 1276   * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
1369 1277   * because we don't have to find the dnode_t for the object.
1370 1278   */
1371 1279  int
1372 1280  dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
1373 1281  {
1374 1282          dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1375 1283          dnode_t *dn;
1376 1284          int err;
1377 1285  
1378 1286          if (size == 0)
1379 1287                  return (0);
1380 1288  
1381 1289          DB_DNODE_ENTER(db);
1382 1290          dn = DB_DNODE(db);
1383 1291          err = dmu_read_uio_dnode(dn, uio, size);
1384 1292          DB_DNODE_EXIT(db);
1385 1293  
1386 1294          return (err);
1387 1295  }
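
/*
 * Illustrative sketch (hypothetical caller, for exposition): a caller that
 * already holds a dbuf in the target object (e.g. a bonus/SA dbuf) can read
 * through it in chunks and avoid the per-call dnode_hold() that
 * dmu_read_uio() would do.  The wrapper name and the 1M chunk size are
 * assumptions.
 */
#if 0
static int
example_read_via_held_dbuf(dmu_buf_t *held_db, uio_t *uio)
{
        int err = 0;

        /* drain the request in bounded chunks */
        while (uio->uio_resid > 0 && err == 0) {
                uint64_t nbytes = MIN(uio->uio_resid, 1024 * 1024);

                err = dmu_read_uio_dbuf(held_db, uio, nbytes);
        }
        return (err);
}
#endif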
1388 1296  
1389 1297  /*
1390 1298   * Read 'size' bytes into the uio buffer.
1391 1299   * From the specified object
1392 1300   * Starting at offset uio->uio_loffset.
1393 1301   */
1394 1302  int
1395 1303  dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
1396 1304  {
1397 1305          dnode_t *dn;
1398 1306          int err;
1399 1307  
1400 1308          if (size == 0)
1401 1309                  return (0);
1402 1310  
1403 1311          err = dnode_hold(os, object, FTAG, &dn);
1404 1312          if (err)
1405 1313                  return (err);
1406 1314  
1407 1315          err = dmu_read_uio_dnode(dn, uio, size);
1408 1316  
1409 1317          dnode_rele(dn, FTAG);
1410 1318  
1411 1319          return (err);
1412 1320  }
1413 1321  
1414 1322  static int
1415 1323  dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
1416 1324  {
1417 1325          dmu_buf_t **dbp;
1418 1326          int numbufs;
1419 1327          int err = 0;
1420 1328          int i;
1421 1329  
1422 1330          err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1423 1331              FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
1424 1332          if (err)
1425 1333                  return (err);
1426 1334  
1427 1335          for (i = 0; i < numbufs; i++) {
1428 1336                  int tocpy;
1429 1337                  int bufoff;
1430 1338                  dmu_buf_t *db = dbp[i];
1431 1339  
1432 1340                  ASSERT(size > 0);
1433 1341  
1434 1342                  bufoff = uio->uio_loffset - db->db_offset;
1435 1343                  tocpy = (int)MIN(db->db_size - bufoff, size);
1436 1344  
1437 1345                  ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1438 1346  
1439 1347                  if (tocpy == db->db_size)
1440 1348                          dmu_buf_will_fill(db, tx);
1441 1349                  else
1442 1350                          dmu_buf_will_dirty(db, tx);
1443 1351  
1444 1352                  /*
1445 1353                   * XXX uiomove could block forever (eg. nfs-backed
1446 1354                   * pages).  There needs to be a uiolockdown() function
1447 1355                   * to lock the pages in memory, so that uiomove won't
1448 1356                   * block.
1449 1357                   */
1450 1358                  err = uiomove((char *)db->db_data + bufoff, tocpy,
1451 1359                      UIO_WRITE, uio);
1452 1360  
1453 1361                  if (tocpy == db->db_size)
1454 1362                          dmu_buf_fill_done(db, tx);
1455 1363  
1456 1364                  if (err)
1457 1365                          break;
1458 1366  
1459 1367                  size -= tocpy;
1460 1368          }
1461 1369  
1462 1370          dmu_buf_rele_array(dbp, numbufs, FTAG);
1463 1371          return (err);
1464 1372  }
1465 1373  
1466 1374  /*
1467 1375   * Write 'size' bytes from the uio buffer.
1468 1376   * To object zdb->db_object.
1469 1377   * Starting at offset uio->uio_loffset.
1470 1378   *
1471 1379   * If the caller already has a dbuf in the target object
1472 1380   * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
1473 1381   * because we don't have to find the dnode_t for the object.
1474 1382   */
1475 1383  int
1476 1384  dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
1477 1385      dmu_tx_t *tx)
1478 1386  {
1479 1387          dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1480 1388          dnode_t *dn;
1481 1389          int err;
1482 1390  
1483 1391          if (size == 0)
1484 1392                  return (0);
1485 1393  
1486 1394          DB_DNODE_ENTER(db);
1487 1395          dn = DB_DNODE(db);
1488 1396          err = dmu_write_uio_dnode(dn, uio, size, tx);
1489 1397          DB_DNODE_EXIT(db);
1490 1398  
1491 1399          return (err);
1492 1400  }
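
/*
 * Illustrative sketch (hypothetical caller, for exposition): uio writes go
 * through an assigned transaction that holds the byte range being written,
 * and dmu_write_uio_dbuf() then copies the data in under that tx.  The
 * wrapper name is an assumption.
 */
#if 0
static int
example_write_via_held_dbuf(objset_t *os, uint64_t object, dmu_buf_t *held_db,
    uio_t *uio, uint64_t nbytes)
{
        dmu_tx_t *tx = dmu_tx_create(os);
        int err;

        dmu_tx_hold_write(tx, object, uio->uio_loffset, nbytes);
        if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
                dmu_tx_abort(tx);
                return (err);
        }
        err = dmu_write_uio_dbuf(held_db, uio, nbytes, tx);
        dmu_tx_commit(tx);
        return (err);
}
#endif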
1493 1401  
1494 1402  /*
1495 1403   * Write 'size' bytes from the uio buffer.
1496 1404   * To the specified object.
1497 1405   * Starting at offset uio->uio_loffset.
1498 1406   */
1499 1407  int
1500 1408  dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
1501 1409      dmu_tx_t *tx)
1502 1410  {
1503 1411          dnode_t *dn;
1504 1412          int err;
1505 1413  
1506 1414          if (size == 0)
1507 1415                  return (0);
1508 1416  
1509 1417          err = dnode_hold(os, object, FTAG, &dn);
1510 1418          if (err)
1511 1419                  return (err);
1512 1420  
1513 1421          err = dmu_write_uio_dnode(dn, uio, size, tx);
1514 1422  
1515 1423          dnode_rele(dn, FTAG);
1516 1424  
1517 1425          return (err);
1518 1426  }
1519 1427  
1520 1428  int
1521 1429  dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1522 1430      page_t *pp, dmu_tx_t *tx)
1523 1431  {
1524 1432          dmu_buf_t **dbp;
1525 1433          int numbufs, i;
1526 1434          int err;
1527 1435  
1528 1436          if (size == 0)
1529 1437                  return (0);
1530 1438  
1531 1439          err = dmu_buf_hold_array(os, object, offset, size,
1532 1440              FALSE, FTAG, &numbufs, &dbp);
1533 1441          if (err)
1534 1442                  return (err);
1535 1443  
1536 1444          for (i = 0; i < numbufs; i++) {
1537 1445                  int tocpy, copied, thiscpy;
1538 1446                  int bufoff;
1539 1447                  dmu_buf_t *db = dbp[i];
1540 1448                  caddr_t va;
1541 1449  
1542 1450                  ASSERT(size > 0);
1543 1451                  ASSERT3U(db->db_size, >=, PAGESIZE);
1544 1452  
1545 1453                  bufoff = offset - db->db_offset;
1546 1454                  tocpy = (int)MIN(db->db_size - bufoff, size);
1547 1455  
1548 1456                  ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1549 1457  
1550 1458                  if (tocpy == db->db_size)
1551 1459                          dmu_buf_will_fill(db, tx);
1552 1460                  else
1553 1461                          dmu_buf_will_dirty(db, tx);
1554 1462  
1555 1463                  for (copied = 0; copied < tocpy; copied += PAGESIZE) {
1556 1464                          ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
1557 1465                          thiscpy = MIN(PAGESIZE, tocpy - copied);
1558 1466                          va = zfs_map_page(pp, S_READ);
1559 1467                          bcopy(va, (char *)db->db_data + bufoff, thiscpy);
1560 1468                          zfs_unmap_page(pp, va);
1561 1469                          pp = pp->p_next;
1562 1470                          bufoff += PAGESIZE;
1563 1471                  }
1564 1472  
1565 1473                  if (tocpy == db->db_size)
1566 1474                          dmu_buf_fill_done(db, tx);
1567 1475  
1568 1476                  offset += tocpy;
1569 1477                  size -= tocpy;
1570 1478          }
1571 1479          dmu_buf_rele_array(dbp, numbufs, FTAG);
1572 1480          return (err);
1573 1481  }
1574 1482  #endif
1575 1483  
1576 1484  /*
1577 1485   * Allocate a loaned anonymous arc buffer.
1578 1486   */
1579 1487  arc_buf_t *
1580 1488  dmu_request_arcbuf(dmu_buf_t *handle, int size)
1581 1489  {
1582 1490          dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
1583 1491  
1584 1492          return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
1585 1493  }
1586 1494  
1587 1495  /*
1588 1496   * Free a loaned arc buffer.
1589 1497   */
1590 1498  void
1591 1499  dmu_return_arcbuf(arc_buf_t *buf)
1592 1500  {
1593 1501          arc_return_buf(buf, FTAG);
1594 1502          arc_buf_destroy(buf, FTAG);
1595 1503  }
1596 1504  
1597 1505  /*
1598 1506   * When possible directly assign passed loaned arc buffer to a dbuf.
1599 1507   * If this is not possible copy the contents of passed arc buf via
1600 1508   * dmu_write().
1601 1509   */
1602 1510  void
1603 1511  dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
1604 1512      dmu_tx_t *tx)
1605 1513  {
1606 1514          dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
1607 1515          dnode_t *dn;
1608 1516          dmu_buf_impl_t *db;
1609 1517          uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
1610 1518          uint64_t blkid;
1611 1519  
1612 1520          DB_DNODE_ENTER(dbuf);
1613 1521          dn = DB_DNODE(dbuf);
1614 1522          rw_enter(&dn->dn_struct_rwlock, RW_READER);
1615 1523          blkid = dbuf_whichblock(dn, 0, offset);
1616 1524          VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
1617 1525          rw_exit(&dn->dn_struct_rwlock);
1618 1526          DB_DNODE_EXIT(dbuf);
1619 1527  
1620 1528          /*
1621 1529           * We can only assign if the offset is aligned, the arc buf is the
1622 1530           * same size as the dbuf, and the dbuf is not metadata.
1623 1531           */
1624 1532          if (offset == db->db.db_offset && blksz == db->db.db_size) {
1625 1533                  dbuf_assign_arcbuf(db, buf, tx);
1626 1534                  dbuf_rele(db, FTAG);
1627 1535          } else {
1628 1536                  objset_t *os;
1629 1537                  uint64_t object;
1630 1538  
1631 1539                  /* compressed bufs must always be assignable to their dbuf */
1632 1540                  ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
1633 1541                  ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
1634 1542  
1635 1543                  DB_DNODE_ENTER(dbuf);
1636 1544                  dn = DB_DNODE(dbuf);
1637 1545                  os = dn->dn_objset;
1638 1546                  object = dn->dn_object;
1639 1547                  DB_DNODE_EXIT(dbuf);
1640 1548  
1641 1549                  dbuf_rele(db, FTAG);
1642 1550                  dmu_write(os, object, offset, blksz, buf->b_data, tx);
1643 1551                  dmu_return_arcbuf(buf);
1644 1552                  XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1645 1553          }
1646 1554  }
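
/*
 * Illustrative sketch (hypothetical caller, for exposition): the intended
 * life cycle of a loaned arc buffer.  The caller loans a block-sized buffer,
 * fills it (here from a uio via uiocopy()), and then either assigns it with
 * dmu_assign_arcbuf() or returns the loan on error.  Names and the use of
 * uiocopy() as the fill step are assumptions.
 */
#if 0
static int
example_loaned_block_write(dmu_buf_t *bonus_db, uint64_t offset, int blksz,
    uio_t *uio, dmu_tx_t *tx)
{
        arc_buf_t *abuf = dmu_request_arcbuf(bonus_db, blksz);
        size_t cbytes;
        int err;

        err = uiocopy(abuf->b_data, blksz, UIO_WRITE, uio, &cbytes);
        if (err != 0) {
                dmu_return_arcbuf(abuf);        /* give the loan back */
                return (err);
        }

        /* consumes abuf: assigned directly, or copied via dmu_write() */
        dmu_assign_arcbuf(bonus_db, offset, abuf, tx);
        return (0);
}
#endif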
1647 1555  
1648 1556  typedef struct {
1649 1557          dbuf_dirty_record_t     *dsa_dr;
1650 1558          dmu_sync_cb_t           *dsa_done;
1651 1559          zgd_t                   *dsa_zgd;
1652 1560          dmu_tx_t                *dsa_tx;
1653 1561  } dmu_sync_arg_t;
1654 1562  
1655 1563  /* ARGSUSED */
1656 1564  static void
1657 1565  dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
1658 1566  {
1659 1567          dmu_sync_arg_t *dsa = varg;
1660 1568          dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
1661 1569          blkptr_t *bp = zio->io_bp;
1662 1570  
1663 1571          if (zio->io_error == 0) {
1664 1572                  if (BP_IS_HOLE(bp)) {
1665 1573                          /*
1666 1574                           * A block of zeros may compress to a hole, but the
1667 1575                           * block size still needs to be known for replay.
1668 1576                           */
1669 1577                          BP_SET_LSIZE(bp, db->db_size);
1670 1578                  } else if (!BP_IS_EMBEDDED(bp)) {
1671 1579                          ASSERT(BP_GET_LEVEL(bp) == 0);
1672 1580                          bp->blk_fill = 1;
1673 1581                  }
1674 1582          }
1675 1583  }
1676 1584  
1677 1585  static void
1678 1586  dmu_sync_late_arrival_ready(zio_t *zio)
1679 1587  {
1680 1588          dmu_sync_ready(zio, NULL, zio->io_private);
1681 1589  }
1682 1590  
1683 1591  /* ARGSUSED */
1684 1592  static void
1685 1593  dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
1686 1594  {
1687 1595          dmu_sync_arg_t *dsa = varg;
1688 1596          dbuf_dirty_record_t *dr = dsa->dsa_dr;
1689 1597          dmu_buf_impl_t *db = dr->dr_dbuf;
     1598 +        zgd_t *zgd = dsa->dsa_zgd;
1690 1599  
     1600 +        /*
     1601 +         * Record the vdev(s) backing this blkptr so they can be flushed after
     1602 +         * the writes for the lwb have completed.
     1603 +         */
     1604 +        if (zio->io_error == 0) {
     1605 +                zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
     1606 +        }
     1607 +
1691 1608          mutex_enter(&db->db_mtx);
1692 1609          ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1693 1610          if (zio->io_error == 0) {
1694 1611                  dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
1695 1612                  if (dr->dt.dl.dr_nopwrite) {
1696 1613                          blkptr_t *bp = zio->io_bp;
1697 1614                          blkptr_t *bp_orig = &zio->io_bp_orig;
1698 1615                          uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
1699 1616  
1700 1617                          ASSERT(BP_EQUAL(bp, bp_orig));
1701 1618                          VERIFY(BP_EQUAL(bp, db->db_blkptr));
1702 1619                          ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
1703 1620                          ASSERT(zio_checksum_table[chksum].ci_flags &
1704 1621                              ZCHECKSUM_FLAG_NOPWRITE);
1705 1622                  }
1706 1623                  dr->dt.dl.dr_overridden_by = *zio->io_bp;
1707 1624                  dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1708 1625                  dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1709 1626  
1710 1627                  /*
1711 1628                   * Old style holes are filled with all zeros, whereas
1712 1629                   * new-style holes maintain their lsize, type, level,
1713 1630                   * and birth time (see zio_write_compress). While we
1714 1631                   * need to reset the BP_SET_LSIZE() call that happened
1715 1632                   * in dmu_sync_ready for old style holes, we do *not*
1716 1633                   * want to wipe out the information contained in new
1717 1634                   * style holes. Thus, only zero out the block pointer if
1718 1635                   * it's an old style hole.
1719 1636                   */
1720 1637                  if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
1721 1638                      dr->dt.dl.dr_overridden_by.blk_birth == 0)
1722 1639                          BP_ZERO(&dr->dt.dl.dr_overridden_by);
1723 1640          } else {
1724 1641                  dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1725 1642          }
1726 1643          cv_broadcast(&db->db_changed);
1727 1644          mutex_exit(&db->db_mtx);
1728 1645  
1729 1646          dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1730 1647  
1731 1648          kmem_free(dsa, sizeof (*dsa));
1732 1649  }
1733 1650  
1734 1651  static void
1735 1652  dmu_sync_late_arrival_done(zio_t *zio)
1736 1653  {
1737 1654          blkptr_t *bp = zio->io_bp;
1738 1655          dmu_sync_arg_t *dsa = zio->io_private;
1739 1656          blkptr_t *bp_orig = &zio->io_bp_orig;
     1657 +        zgd_t *zgd = dsa->dsa_zgd;
1740 1658  
1741      -        if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1742      -                ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
1743      -                ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
1744      -                ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1745      -                ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1746      -                zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
     1659 +        if (zio->io_error == 0) {
     1660 +                /*
     1661 +                 * Record the vdev(s) backing this blkptr so they can be
     1662 +                 * flushed after the writes for the lwb have completed.
     1663 +                 */
     1664 +                zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
     1665 +
     1666 +                if (!BP_IS_HOLE(bp)) {
     1667 +                        ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
     1668 +                        ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
     1669 +                        ASSERT(zio->io_bp->blk_birth == zio->io_txg);
     1670 +                        ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
     1671 +                        zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
     1672 +                }
1747 1673          }
1748 1674  
1749 1675          dmu_tx_commit(dsa->dsa_tx);
1750 1676  
1751 1677          dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1752 1678  
1753 1679          abd_put(zio->io_abd);
1754 1680          kmem_free(dsa, sizeof (*dsa));
1755 1681  }
1756 1682  
1757 1683  static int
1758 1684  dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
1759      -    zio_prop_t *zp, zbookmark_phys_t *zb)
     1685 +    zio_prop_t *zp, zbookmark_phys_t *zb, const zio_smartcomp_info_t *sc)
1760 1686  {
1761 1687          dmu_sync_arg_t *dsa;
1762 1688          dmu_tx_t *tx;
1763 1689  
1764 1690          tx = dmu_tx_create(os);
1765 1691          dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1766 1692          if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1767 1693                  dmu_tx_abort(tx);
1768 1694                  /* Make zl_get_data do txg_wait_synced() */
1769 1695                  return (SET_ERROR(EIO));
1770 1696          }
1771 1697  
1772 1698          /*
1773 1699           * In order to prevent the zgd's lwb from being free'd prior to
1774 1700           * dmu_sync_late_arrival_done() being called, we have to ensure
1775 1701           * the lwb's "max txg" takes this tx's txg into account.
1776 1702           */
1777 1703          zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
1778 1704  
1779 1705          dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1780 1706          dsa->dsa_dr = NULL;
1781 1707          dsa->dsa_done = done;
1782 1708          dsa->dsa_zgd = zgd;
1783 1709          dsa->dsa_tx = tx;
1784 1710  
1785 1711          /*
1786 1712           * Since we are currently syncing this txg, it's nontrivial to
1787 1713           * determine what BP to nopwrite against, so we disable nopwrite.
1788 1714           *
1789 1715           * When syncing, the db_blkptr is initially the BP of the previous
1790 1716           * txg.  We can not nopwrite against it because it will be changed
1791 1717           * (this is similar to the non-late-arrival case where the dbuf is
1792 1718           * dirty in a future txg).
1793 1719           *
1794 1720           * Then dbuf_write_ready() sets db_blkptr to the location we will write.
1795 1721           * We can not nopwrite against it because although the BP will not
1796 1722           * (typically) be changed, the data has not yet been persisted to this
1797 1723           * location.
1798 1724           *
1799 1725           * Finally, when dbuf_write_done() is called, it is theoretically
1800 1726           * possible to always nopwrite, because the data that was written in
1801 1727           * this txg is the same data that we are trying to write.  However we
1802 1728           * would need to check that this dbuf is not dirty in any future
1803 1729           * txg's (as we do in the normal dmu_sync() path). For simplicity, we
1804 1730           * don't nopwrite in this case.
1805 1731           */
1806 1732          zp->zp_nopwrite = B_FALSE;
1807 1733  
1808 1734          zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1809 1735              abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
1810 1736              zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
1811 1737              dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
1812      -            dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
     1738 +            dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb, sc));
1813 1739  
1814 1740          return (0);
1815 1741  }
1816 1742  
1817 1743  /*
1818 1744   * Intent log support: sync the block associated with db to disk.
1819 1745   * N.B. and XXX: the caller is responsible for making sure that the
1820 1746   * data isn't changing while dmu_sync() is writing it.
1821 1747   *
1822 1748   * Return values:
1823 1749   *
1824 1750   *      EEXIST: this txg has already been synced, so there's nothing to do.
1825 1751   *              The caller should not log the write.
1826 1752   *
1827 1753   *      ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1828 1754   *              The caller should not log the write.
1829 1755   *
1830 1756   *      EALREADY: this block is already in the process of being synced.
1831 1757   *              The caller should track its progress (somehow).
1832 1758   *
1833 1759   *      EIO: could not do the I/O.
1834 1760   *              The caller should do a txg_wait_synced().
1835 1761   *
1836 1762   *      0: the I/O has been initiated.
1837 1763   *              The caller should log this blkptr in the done callback.
1838 1764   *              It is possible that the I/O will fail, in which case
1839 1765   *              the error will be reported to the done callback and
1840 1766   *              propagated to pio from zio_done().
1841 1767   */
     1768 +
1842 1769  int
1843 1770  dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
1844 1771  {
1845 1772          dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1846 1773          objset_t *os = db->db_objset;
1847 1774          dsl_dataset_t *ds = os->os_dsl_dataset;
1848 1775          dbuf_dirty_record_t *dr;
1849 1776          dmu_sync_arg_t *dsa;
1850 1777          zbookmark_phys_t zb;
1851 1778          zio_prop_t zp;
1852 1779          dnode_t *dn;
     1780 +        int flags = 0;
     1781 +        zio_smartcomp_info_t sc;
1853 1782  
1854 1783          ASSERT(pio != NULL);
1855 1784          ASSERT(txg != 0);
1856 1785  
1857 1786          SET_BOOKMARK(&zb, ds->ds_object,
1858 1787              db->db.db_object, db->db_level, db->db_blkid);
1859 1788  
     1789 +        /* write to special only if proper conditions hold */
     1790 +        if (spa_write_data_to_special(os->os_spa, os))
     1791 +                WP_SET_SPECIALCLASS(flags, B_TRUE);
     1792 +
1860 1793          DB_DNODE_ENTER(db);
1861 1794          dn = DB_DNODE(db);
1862      -        dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
     1795 +        dmu_write_policy(os, dn, db->db_level, flags | WP_DMU_SYNC, &zp);
     1796 +        dnode_setup_zio_smartcomp(db, &sc);
1863 1797          DB_DNODE_EXIT(db);
1864 1798  
1865 1799          /*
1866 1800           * If we're frozen (running ziltest), we always need to generate a bp.
1867 1801           */
1868 1802          if (txg > spa_freeze_txg(os->os_spa))
1869      -                return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
     1803 +                return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb,
     1804 +                    &sc));
1870 1805  
1871 1806          /*
1872 1807           * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1873 1808           * and us.  If we determine that this txg is not yet syncing,
1874 1809           * but it begins to sync a moment later, that's OK because the
1875 1810           * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
1876 1811           */
1877 1812          mutex_enter(&db->db_mtx);
1878 1813  
1879 1814          if (txg <= spa_last_synced_txg(os->os_spa)) {
1880 1815                  /*
1881 1816                   * This txg has already synced.  There's nothing to do.
1882 1817                   */
1883 1818                  mutex_exit(&db->db_mtx);
1884 1819                  return (SET_ERROR(EEXIST));
1885 1820          }
1886 1821  
1887 1822          if (txg <= spa_syncing_txg(os->os_spa)) {
1888 1823                  /*
1889 1824                   * This txg is currently syncing, so we can't mess with
1890 1825                   * the dirty record anymore; just write a new log block.
1891 1826                   */
1892 1827                  mutex_exit(&db->db_mtx);
1893      -                return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
     1828 +                return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb,
     1829 +                    &sc));
1894 1830          }
1895 1831  
1896 1832          dr = db->db_last_dirty;
1897 1833          while (dr && dr->dr_txg != txg)
1898 1834                  dr = dr->dr_next;
1899 1835  
1900 1836          if (dr == NULL) {
1901 1837                  /*
1902 1838                   * There's no dr for this dbuf, so it must have been freed.
1903 1839                   * There's no need to log writes to freed blocks, so we're done.
1904 1840                   */
1905 1841                  mutex_exit(&db->db_mtx);
1906 1842                  return (SET_ERROR(ENOENT));
1907 1843          }
1908 1844  
1909 1845          ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
1910 1846  
1911 1847          if (db->db_blkptr != NULL) {
1912 1848                  /*
1913 1849                   * We need to fill in zgd_bp with the current blkptr so that
1914 1850                   * the nopwrite code can check if we're writing the same
1915 1851                   * data that's already on disk.  We can only nopwrite if we
1916 1852                   * are sure that after making the copy, db_blkptr will not
1917 1853                   * change until our i/o completes.  We ensure this by
1918 1854                   * holding the db_mtx, and only allowing nopwrite if the
1919 1855                   * block is not already dirty (see below).  This is verified
1920 1856                   * by dmu_sync_done(), which VERIFYs that the db_blkptr has
1921 1857                   * not changed.
1922 1858                   */
1923 1859                  *zgd->zgd_bp = *db->db_blkptr;
1924 1860          }
1925 1861  
1926 1862          /*
1927 1863           * Assume the on-disk data is X, the current syncing data (in
1928 1864           * txg - 1) is Y, and the current in-memory data is Z (currently
1929 1865           * in dmu_sync).
1930 1866           *
1931 1867           * We usually want to perform a nopwrite if X and Z are the
1932 1868           * same.  However, if Y is different (i.e. the BP is going to
1933 1869           * change before this write takes effect), then a nopwrite will
1934 1870           * be incorrect - we would override with X, which could have
1935 1871           * been freed when Y was written.
1936 1872           *
1937 1873           * (Note that this is not a concern when we are nop-writing from
1938 1874           * syncing context, because X and Y must be identical, because
1939 1875           * all previous txgs have been synced.)
1940 1876           *
1941 1877           * Therefore, we disable nopwrite if the current BP could change
1942 1878           * before this TXG.  There are two ways it could change: by
1943 1879           * being dirty (dr_next is non-NULL), or by being freed
1944 1880           * (dnode_block_freed()).  This behavior is verified by
1945 1881           * zio_done(), which VERIFYs that the override BP is identical
1946 1882           * to the on-disk BP.
1947 1883           */
1948 1884          DB_DNODE_ENTER(db);
1949 1885          dn = DB_DNODE(db);
1950 1886          if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
1951 1887                  zp.zp_nopwrite = B_FALSE;
1952 1888          DB_DNODE_EXIT(db);
1953 1889  
1954 1890          ASSERT(dr->dr_txg == txg);
1955 1891          if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
1956 1892              dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
1957 1893                  /*
1958 1894                   * We have already issued a sync write for this buffer,
1959 1895                   * or this buffer has already been synced.  It could not
1960 1896                   * have been dirtied since, or we would have cleared the state.
1961 1897                   */
1962 1898                  mutex_exit(&db->db_mtx);
1963 1899                  return (SET_ERROR(EALREADY));
1964 1900          }
1965 1901  
1966 1902          ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1967 1903          dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1968 1904          mutex_exit(&db->db_mtx);
1969 1905  
1970 1906          dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1971 1907          dsa->dsa_dr = dr;
1972 1908          dsa->dsa_done = done;
1973 1909          dsa->dsa_zgd = zgd;
1974 1910          dsa->dsa_tx = NULL;
1975 1911  
1976 1912          zio_nowait(arc_write(pio, os->os_spa, txg,
1977 1913              zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
1978 1914              &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
1979      -            ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
     1915 +            ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb, &sc));
1980 1916  
1981 1917          return (0);
1982 1918  }
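
/*
 * Illustrative sketch (hypothetical ZIL get_data caller, for exposition):
 * acting on the dmu_sync() return-value contract documented above.
 * example_done_cb and the surrounding zgd/tx setup are assumptions.
 */
#if 0
static int
example_issue_dmu_sync(zio_t *pio, objset_t *os, uint64_t txg, zgd_t *zgd)
{
        int err = dmu_sync(pio, txg, example_done_cb, zgd);

        if (err == EIO) {
                /*
                 * The I/O could not be issued; the data will reach disk when
                 * the txg syncs, so wait for that before returning.
                 */
                txg_wait_synced(dmu_objset_pool(os), txg);
        }
        /*
         * EEXIST and ENOENT mean there is nothing to log; EALREADY means a
         * sync write for this block is already in flight; 0 means the blkptr
         * will be logged from example_done_cb() when the write completes.
         */
        return (err);
}
#endif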
1983 1919  
1984 1920  int
1985 1921  dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1986 1922      dmu_tx_t *tx)
1987 1923  {
1988 1924          dnode_t *dn;
1989 1925          int err;
1990 1926  
1991 1927          err = dnode_hold(os, object, FTAG, &dn);
1992 1928          if (err)
1993 1929                  return (err);
1994 1930          err = dnode_set_blksz(dn, size, ibs, tx);
1995 1931          dnode_rele(dn, FTAG);
1996 1932          return (err);
1997 1933  }
1998 1934  
1999 1935  void
2000 1936  dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
2001 1937      dmu_tx_t *tx)
2002 1938  {
2003 1939          dnode_t *dn;
2004 1940  
2005 1941          /*
2006 1942           * Send streams include each object's checksum function.  This
2007 1943           * check ensures that the receiving system can understand the
2008 1944           * checksum function transmitted.
2009 1945           */
2010 1946          ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
2011 1947  
2012 1948          VERIFY0(dnode_hold(os, object, FTAG, &dn));
2013 1949          ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
2014 1950          dn->dn_checksum = checksum;
2015 1951          dnode_setdirty(dn, tx);
2016 1952          dnode_rele(dn, FTAG);
2017 1953  }
2018 1954  
2019 1955  void
2020 1956  dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
2021 1957      dmu_tx_t *tx)
2022 1958  {
2023 1959          dnode_t *dn;
2024 1960  
2025 1961          /*
2026 1962           * Send streams include each object's compression function.  This
2027 1963           * check ensures that the receiving system can understand the
2028 1964           * compression function transmitted.
2029 1965           */
2030 1966          ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
2031 1967  
2032 1968          VERIFY0(dnode_hold(os, object, FTAG, &dn));
2033 1969          dn->dn_compress = compress;
2034 1970          dnode_setdirty(dn, tx);
2035 1971          dnode_rele(dn, FTAG);
2036 1972  }
2037 1973  
2038 1974  int zfs_mdcomp_disable = 0;
2039 1975  
2040 1976  /*
2041 1977   * When the "redundant_metadata" property is set to "most", only indirect
2042 1978   * blocks of this level and higher will have an additional ditto block.
2043 1979   */
2044 1980  int zfs_redundant_metadata_most_ditto_level = 2;
2045 1981  
2046 1982  void
2047 1983  dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
2048 1984  {
2049 1985          dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
2050 1986          boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
2051 1987              (wp & WP_SPILL));
2052 1988          enum zio_checksum checksum = os->os_checksum;
2053 1989          enum zio_compress compress = os->os_compress;
2054 1990          enum zio_checksum dedup_checksum = os->os_dedup_checksum;
2055 1991          boolean_t dedup = B_FALSE;
2056 1992          boolean_t nopwrite = B_FALSE;
2057 1993          boolean_t dedup_verify = os->os_dedup_verify;
2058 1994          int copies = os->os_copies;
2059 1995  
2060 1996          /*
2061 1997           * We maintain different write policies for each of the following
2062 1998           * types of data:
2063 1999           *       1. metadata
2064 2000           *       2. preallocated blocks (i.e. level-0 blocks of a dump device)
2065 2001           *       3. all other level 0 blocks
2066 2002           */
2067 2003          if (ismd) {
2068 2004                  if (zfs_mdcomp_disable) {
2069 2005                          compress = ZIO_COMPRESS_EMPTY;
2070 2006                  } else {
2071 2007                          /*
2072 2008                           * XXX -- we should design a compression algorithm
2073 2009                           * that specializes in arrays of bps.
2074 2010                           */
2075 2011                          compress = zio_compress_select(os->os_spa,
2076 2012                              ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
2077 2013                  }
2078 2014  
2079 2015                  /*
2080 2016                   * Metadata always gets checksummed.  If the data
2081 2017                   * checksum is multi-bit correctable, and it's not a
2082 2018                   * ZBT-style checksum, then it's suitable for metadata
2083 2019                   * as well.  Otherwise, the metadata checksum defaults
2084 2020                   * to fletcher4.
2085 2021                   */
2086 2022                  if (!(zio_checksum_table[checksum].ci_flags &
2087 2023                      ZCHECKSUM_FLAG_METADATA) ||
2088 2024                      (zio_checksum_table[checksum].ci_flags &
2089 2025                      ZCHECKSUM_FLAG_EMBEDDED))
2090 2026                          checksum = ZIO_CHECKSUM_FLETCHER_4;
2091 2027  
2092 2028                  if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
2093 2029                      (os->os_redundant_metadata ==
2094 2030                      ZFS_REDUNDANT_METADATA_MOST &&
2095 2031                      (level >= zfs_redundant_metadata_most_ditto_level ||
2096 2032                      DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
2097 2033                          copies++;
2098 2034          } else if (wp & WP_NOFILL) {
2099 2035                  ASSERT(level == 0);
2100 2036  
2101 2037                  /*
2102 2038                   * If we're writing preallocated blocks, we aren't actually
2103 2039                   * writing them so don't set any policy properties.  These
2104 2040                   * blocks are currently only used by an external subsystem
2105 2041                   * outside of zfs (i.e. dump) and not written by the zio
2106 2042                   * pipeline.
2107 2043                   */
2108 2044                  compress = ZIO_COMPRESS_OFF;
2109 2045                  checksum = ZIO_CHECKSUM_NOPARITY;
2110 2046          } else {
2111 2047                  compress = zio_compress_select(os->os_spa, dn->dn_compress,
2112 2048                      compress);
2113 2049  
2114 2050                  checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
2115 2051                      zio_checksum_select(dn->dn_checksum, checksum) :
2116 2052                      dedup_checksum;
2117 2053  
2118 2054                  /*
2119 2055                   * Determine dedup setting.  If we are in dmu_sync(),
2120 2056                   * we won't actually dedup now because that's all
2121 2057                   * done in syncing context; but we do want to use the
2122 2058                   * dedup checksum.  If the checksum is not strong
2123 2059                   * enough to ensure unique signatures, force
2124 2060                   * dedup_verify.
2125 2061                   */
2126 2062                  if (dedup_checksum != ZIO_CHECKSUM_OFF) {
2127 2063                          dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
2128 2064                          if (!(zio_checksum_table[checksum].ci_flags &
2129 2065                              ZCHECKSUM_FLAG_DEDUP))
2130 2066                                  dedup_verify = B_TRUE;
2131 2067                  }
2132 2068  
2133 2069                  /*
2134 2070                   * Enable nopwrite if we have secure enough checksum
2135 2071                   * algorithm (see comment in zio_nop_write) and
2136 2072                   * compression is enabled.  We don't enable nopwrite if
2137 2073                   * dedup is enabled as the two features are mutually
2138 2074                   * exclusive.
2139 2075                   */
2140 2076                  nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
2141 2077                      ZCHECKSUM_FLAG_NOPWRITE) &&
2142 2078                      compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
2143 2079          }
2144 2080  
     2081 +        zp->zp_usesc = WP_GET_SPECIALCLASS(wp);
2145 2082          zp->zp_checksum = checksum;
2146 2083          zp->zp_compress = compress;
2147 2084          ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
2148 2085  
2149 2086          zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
2150 2087          zp->zp_level = level;
2151 2088          zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
2152 2089          zp->zp_dedup = dedup;
2153 2090          zp->zp_dedup_verify = dedup && dedup_verify;
     2091 +        zp->zp_metadata = ismd;
2154 2092          zp->zp_nopwrite = nopwrite;
     2093 +        zp->zp_zpl_meta_to_special = os->os_zpl_meta_to_special;
     2094 +        zp->zp_usewbc = (zp->zp_usesc &&
     2095 +            os->os_wbc_mode == ZFS_WBC_MODE_ON && !ismd);
     2096 +
     2097 +        /* explicitly control the number of copies for DDT */
     2098 +        if (DMU_OT_IS_DDT_META(type) &&
     2099 +            os->os_spa->spa_ddt_meta_copies > 0) {
     2100 +                zp->zp_copies =
     2101 +                    MIN(os->os_spa->spa_ddt_meta_copies,
     2102 +                    spa_max_replication(os->os_spa));
     2103 +        }
     2104 +
     2105 +        DTRACE_PROBE2(dmu_wp, boolean_t, zp->zp_metadata,
     2106 +            boolean_t, zp->zp_usesc);
2155 2107  }
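
/*
 * Illustrative sketch (hypothetical caller, for exposition): a write path
 * that wants its data to be eligible for the special class mirrors
 * dmu_sync() above; it asks the pool whether data may go to the special
 * class and, if so, sets the special-class bit in the write-policy flags
 * before calling dmu_write_policy().  The wrapper name is an assumption.
 */
#if 0
static void
example_setup_write_policy(objset_t *os, dnode_t *dn, int level, int base_wp,
    zio_prop_t *zp)
{
        int wp = base_wp;

        if (spa_write_data_to_special(os->os_spa, os))
                WP_SET_SPECIALCLASS(wp, B_TRUE);

        dmu_write_policy(os, dn, level, wp, zp);
        /* zp->zp_usesc now reflects whether the special class may be used */
}
#endif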
2156 2108  
2157 2109  int
2158 2110  dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
2159 2111  {
2160 2112          dnode_t *dn;
2161 2113          int err;
2162 2114  
2163 2115          /*
2164 2116           * Sync any current changes before
2165 2117           * we go trundling through the block pointers.
2166 2118           */
2167 2119          err = dmu_object_wait_synced(os, object);
2168 2120          if (err) {
2169 2121                  return (err);
2170 2122          }
2171 2123  
2172 2124          err = dnode_hold(os, object, FTAG, &dn);
2173 2125          if (err) {
2174 2126                  return (err);
2175 2127          }
2176 2128  
2177 2129          err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
2178 2130          dnode_rele(dn, FTAG);
2179 2131  
2180 2132          return (err);
2181 2133  }
2182 2134  
2183 2135  /*
2184 2136   * Given the ZFS object, if it contains any dirty nodes
2185 2137   * this function flushes all dirty blocks to disk. This
2186 2138   * ensures the DMU object info is updated. A more efficient
2187 2139   * future version might just find the TXG with the maximum
2188 2140   * ID and wait for that to be synced.
2189 2141   */
2190 2142  int
2191 2143  dmu_object_wait_synced(objset_t *os, uint64_t object)
2192 2144  {
2193 2145          dnode_t *dn;
2194 2146          int error, i;
2195 2147  
2196 2148          error = dnode_hold(os, object, FTAG, &dn);
2197 2149          if (error) {
2198 2150                  return (error);
2199 2151          }
2200 2152  
2201 2153          for (i = 0; i < TXG_SIZE; i++) {
2202 2154                  if (list_link_active(&dn->dn_dirty_link[i])) {
2203 2155                          break;
2204 2156                  }
2205 2157          }
2206 2158          dnode_rele(dn, FTAG);
2207 2159          if (i != TXG_SIZE) {
2208 2160                  txg_wait_synced(dmu_objset_pool(os), 0);
2209 2161          }
2210 2162  
2211 2163          return (0);
2212 2164  }
2213 2165  
2214 2166  void
2215 2167  dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
2216 2168  {
2217 2169          dnode_phys_t *dnp;
2218 2170  
2219 2171          rw_enter(&dn->dn_struct_rwlock, RW_READER);
2220 2172          mutex_enter(&dn->dn_mtx);
2221 2173  
2222 2174          dnp = dn->dn_phys;
2223 2175  
2224 2176          doi->doi_data_block_size = dn->dn_datablksz;
2225 2177          doi->doi_metadata_block_size = dn->dn_indblkshift ?
2226 2178              1ULL << dn->dn_indblkshift : 0;
2227 2179          doi->doi_type = dn->dn_type;
2228 2180          doi->doi_bonus_type = dn->dn_bonustype;
2229 2181          doi->doi_bonus_size = dn->dn_bonuslen;
2230 2182          doi->doi_indirection = dn->dn_nlevels;
2231 2183          doi->doi_checksum = dn->dn_checksum;
2232 2184          doi->doi_compress = dn->dn_compress;
2233 2185          doi->doi_nblkptr = dn->dn_nblkptr;
2234 2186          doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
2235 2187          doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
2236 2188          doi->doi_fill_count = 0;
2237 2189          for (int i = 0; i < dnp->dn_nblkptr; i++)
2238 2190                  doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
2239 2191  
2240 2192          mutex_exit(&dn->dn_mtx);
2241 2193          rw_exit(&dn->dn_struct_rwlock);
2242 2194  }
2243 2195  
2244 2196  /*
2245 2197   * Get information on a DMU object.
2246 2198   * If doi is NULL, just indicates whether the object exists.
2247 2199   */
2248 2200  int
2249 2201  dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
2250 2202  {
2251 2203          dnode_t *dn;
2252 2204          int err = dnode_hold(os, object, FTAG, &dn);
2253 2205  
2254 2206          if (err)
2255 2207                  return (err);
2256 2208  
2257 2209          if (doi != NULL)
2258 2210                  dmu_object_info_from_dnode(dn, doi);
2259 2211  
2260 2212          dnode_rele(dn, FTAG);
2261 2213          return (0);
2262 2214  }
2263 2215  
2264 2216  /*
2265 2217   * As above, but faster; can be used when you have a held dbuf in hand.
2266 2218   */
2267 2219  void
2268 2220  dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
2269 2221  {
2270 2222          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2271 2223  
2272 2224          DB_DNODE_ENTER(db);
2273 2225          dmu_object_info_from_dnode(DB_DNODE(db), doi);
2274 2226          DB_DNODE_EXIT(db);
2275 2227  }
2276 2228  
2277 2229  /*
2278 2230   * Faster still when you only care about the size.
2279 2231   * This is specifically optimized for zfs_getattr().
2280 2232   */
2281 2233  void
2282 2234  dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
2283 2235      u_longlong_t *nblk512)
2284 2236  {
2285 2237          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
2286 2238          dnode_t *dn;
2287 2239  
2288 2240          DB_DNODE_ENTER(db);
2289 2241          dn = DB_DNODE(db);
2290 2242  
2291 2243          *blksize = dn->dn_datablksz;
2292 2244          /* add 1 for dnode space */
2293 2245          *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
2294 2246              SPA_MINBLOCKSHIFT) + 1;
2295 2247          DB_DNODE_EXIT(db);
2296 2248  }
2297 2249  
2298 2250  void
2299 2251  byteswap_uint64_array(void *vbuf, size_t size)
2300 2252  {
2301 2253          uint64_t *buf = vbuf;
2302 2254          size_t count = size >> 3;
2303 2255          int i;
2304 2256  
2305 2257          ASSERT((size & 7) == 0);
2306 2258  
2307 2259          for (i = 0; i < count; i++)
2308 2260                  buf[i] = BSWAP_64(buf[i]);
2309 2261  }
2310 2262  
2311 2263  void
2312 2264  byteswap_uint32_array(void *vbuf, size_t size)
2313 2265  {
2314 2266          uint32_t *buf = vbuf;
2315 2267          size_t count = size >> 2;
2316 2268          int i;
2317 2269  
2318 2270          ASSERT((size & 3) == 0);
2319 2271  
2320 2272          for (i = 0; i < count; i++)
2321 2273                  buf[i] = BSWAP_32(buf[i]);
2322 2274  }
2323 2275  
2324 2276  void
2325 2277  byteswap_uint16_array(void *vbuf, size_t size)
2326 2278  {
2327 2279          uint16_t *buf = vbuf;
2328 2280          size_t count = size >> 1;
2329 2281          int i;
2330 2282  
2331 2283          ASSERT((size & 1) == 0);
2332 2284  
2333 2285          for (i = 0; i < count; i++)
2334 2286                  buf[i] = BSWAP_16(buf[i]);
2335 2287  }
2336 2288  
2337 2289  /* ARGSUSED */
2338 2290  void
2339 2291  byteswap_uint8_array(void *vbuf, size_t size)
2340 2292  {
2341 2293  }
2342 2294  
2343 2295  void
2344 2296  dmu_init(void)
2345 2297  {
2346 2298          abd_init();
2347 2299          zfs_dbgmsg_init();
2348 2300          sa_cache_init();
2349 2301          xuio_stat_init();
2350 2302          dmu_objset_init();
2351 2303          dnode_init();
2352 2304          zfetch_init();
2353 2305          l2arc_init();
2354 2306          arc_init();
2355 2307          dbuf_init();
2356 2308  }
2357 2309  
2358 2310  void
2359 2311  dmu_fini(void)
2360 2312  {
2361 2313          arc_fini(); /* arc depends on l2arc, so arc must go first */
2362 2314          l2arc_fini();
2363 2315          zfetch_fini();
2364 2316          dbuf_fini();
2365 2317          dnode_fini();
2366 2318          dmu_objset_fini();
2367 2319          xuio_stat_fini();
2368 2320          sa_cache_fini();
2369 2321          zfs_dbgmsg_fini();
2370 2322          abd_fini();
2371 2323  }
    (207 lines elided)