2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
    
      
    
--- old/usr/src/uts/common/fs/zfs/dmu.c
+++ new/usr/src/uts/common/fs/zfs/dmu.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  
  
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
       23 + * Copyright (c) 2012 by Delphix. All rights reserved.
  23   24   */
  24   25  
  25   26  #include <sys/dmu.h>
  26   27  #include <sys/dmu_impl.h>
  27   28  #include <sys/dmu_tx.h>
  28   29  #include <sys/dbuf.h>
  29   30  #include <sys/dnode.h>
  30   31  #include <sys/zfs_context.h>
  31   32  #include <sys/dmu_objset.h>
  32   33  #include <sys/dmu_traverse.h>
  33   34  #include <sys/dsl_dataset.h>
  34   35  #include <sys/dsl_dir.h>
  35   36  #include <sys/dsl_pool.h>
  36   37  #include <sys/dsl_synctask.h>
  37   38  #include <sys/dsl_prop.h>
  38   39  #include <sys/dmu_zfetch.h>
  
  
  39   40  #include <sys/zfs_ioctl.h>
  40   41  #include <sys/zap.h>
  41   42  #include <sys/zio_checksum.h>
  42   43  #include <sys/sa.h>
  43   44  #ifdef _KERNEL
  44   45  #include <sys/vmsystm.h>
  45   46  #include <sys/zfs_znode.h>
  46   47  #endif
  47   48  
  48   49  const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
  49      -        {       byteswap_uint8_array,   TRUE,   "unallocated"           },
  50      -        {       zap_byteswap,           TRUE,   "object directory"      },
  51      -        {       byteswap_uint64_array,  TRUE,   "object array"          },
  52      -        {       byteswap_uint8_array,   TRUE,   "packed nvlist"         },
  53      -        {       byteswap_uint64_array,  TRUE,   "packed nvlist size"    },
  54      -        {       byteswap_uint64_array,  TRUE,   "bpobj"                 },
  55      -        {       byteswap_uint64_array,  TRUE,   "bpobj header"          },
  56      -        {       byteswap_uint64_array,  TRUE,   "SPA space map header"  },
  57      -        {       byteswap_uint64_array,  TRUE,   "SPA space map"         },
  58      -        {       byteswap_uint64_array,  TRUE,   "ZIL intent log"        },
  59      -        {       dnode_buf_byteswap,     TRUE,   "DMU dnode"             },
  60      -        {       dmu_objset_byteswap,    TRUE,   "DMU objset"            },
  61      -        {       byteswap_uint64_array,  TRUE,   "DSL directory"         },
  62      -        {       zap_byteswap,           TRUE,   "DSL directory child map"},
  63      -        {       zap_byteswap,           TRUE,   "DSL dataset snap map"  },
  64      -        {       zap_byteswap,           TRUE,   "DSL props"             },
  65      -        {       byteswap_uint64_array,  TRUE,   "DSL dataset"           },
  66      -        {       zfs_znode_byteswap,     TRUE,   "ZFS znode"             },
  67      -        {       zfs_oldacl_byteswap,    TRUE,   "ZFS V0 ACL"            },
  68      -        {       byteswap_uint8_array,   FALSE,  "ZFS plain file"        },
  69      -        {       zap_byteswap,           TRUE,   "ZFS directory"         },
  70      -        {       zap_byteswap,           TRUE,   "ZFS master node"       },
  71      -        {       zap_byteswap,           TRUE,   "ZFS delete queue"      },
  72      -        {       byteswap_uint8_array,   FALSE,  "zvol object"           },
  73      -        {       zap_byteswap,           TRUE,   "zvol prop"             },
  74      -        {       byteswap_uint8_array,   FALSE,  "other uint8[]"         },
  75      -        {       byteswap_uint64_array,  FALSE,  "other uint64[]"        },
  76      -        {       zap_byteswap,           TRUE,   "other ZAP"             },
  77      -        {       zap_byteswap,           TRUE,   "persistent error log"  },
  78      -        {       byteswap_uint8_array,   TRUE,   "SPA history"           },
  79      -        {       byteswap_uint64_array,  TRUE,   "SPA history offsets"   },
  80      -        {       zap_byteswap,           TRUE,   "Pool properties"       },
  81      -        {       zap_byteswap,           TRUE,   "DSL permissions"       },
  82      -        {       zfs_acl_byteswap,       TRUE,   "ZFS ACL"               },
  83      -        {       byteswap_uint8_array,   TRUE,   "ZFS SYSACL"            },
  84      -        {       byteswap_uint8_array,   TRUE,   "FUID table"            },
  85      -        {       byteswap_uint64_array,  TRUE,   "FUID table size"       },
  86      -        {       zap_byteswap,           TRUE,   "DSL dataset next clones"},
  87      -        {       zap_byteswap,           TRUE,   "scan work queue"       },
  88      -        {       zap_byteswap,           TRUE,   "ZFS user/group used"   },
  89      -        {       zap_byteswap,           TRUE,   "ZFS user/group quota"  },
  90      -        {       zap_byteswap,           TRUE,   "snapshot refcount tags"},
  91      -        {       zap_byteswap,           TRUE,   "DDT ZAP algorithm"     },
  92      -        {       zap_byteswap,           TRUE,   "DDT statistics"        },
  93      -        {       byteswap_uint8_array,   TRUE,   "System attributes"     },
  94      -        {       zap_byteswap,           TRUE,   "SA master node"        },
  95      -        {       zap_byteswap,           TRUE,   "SA attr registration"  },
  96      -        {       zap_byteswap,           TRUE,   "SA attr layouts"       },
  97      -        {       zap_byteswap,           TRUE,   "scan translations"     },
  98      -        {       byteswap_uint8_array,   FALSE,  "deduplicated block"    },
  99      -        {       zap_byteswap,           TRUE,   "DSL deadlist map"      },
 100      -        {       byteswap_uint64_array,  TRUE,   "DSL deadlist map hdr"  },
 101      -        {       zap_byteswap,           TRUE,   "DSL dir clones"        },
 102      -        {       byteswap_uint64_array,  TRUE,   "bpobj subobj"          },
       50 +        {       DMU_BSWAP_UINT8,        TRUE,   "unallocated"           },
       51 +        {       DMU_BSWAP_ZAP,          TRUE,   "object directory"      },
       52 +        {       DMU_BSWAP_UINT64,       TRUE,   "object array"          },
       53 +        {       DMU_BSWAP_UINT8,        TRUE,   "packed nvlist"         },
       54 +        {       DMU_BSWAP_UINT64,       TRUE,   "packed nvlist size"    },
       55 +        {       DMU_BSWAP_UINT64,       TRUE,   "bpobj"                 },
       56 +        {       DMU_BSWAP_UINT64,       TRUE,   "bpobj header"          },
       57 +        {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map header"  },
       58 +        {       DMU_BSWAP_UINT64,       TRUE,   "SPA space map"         },
       59 +        {       DMU_BSWAP_UINT64,       TRUE,   "ZIL intent log"        },
       60 +        {       DMU_BSWAP_DNODE,        TRUE,   "DMU dnode"             },
       61 +        {       DMU_BSWAP_OBJSET,       TRUE,   "DMU objset"            },
       62 +        {       DMU_BSWAP_UINT64,       TRUE,   "DSL directory"         },
       63 +        {       DMU_BSWAP_ZAP,          TRUE,   "DSL directory child map"},
       64 +        {       DMU_BSWAP_ZAP,          TRUE,   "DSL dataset snap map"  },
       65 +        {       DMU_BSWAP_ZAP,          TRUE,   "DSL props"             },
       66 +        {       DMU_BSWAP_UINT64,       TRUE,   "DSL dataset"           },
       67 +        {       DMU_BSWAP_ZNODE,        TRUE,   "ZFS znode"             },
       68 +        {       DMU_BSWAP_OLDACL,       TRUE,   "ZFS V0 ACL"            },
       69 +        {       DMU_BSWAP_UINT8,        FALSE,  "ZFS plain file"        },
       70 +        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS directory"         },
       71 +        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS master node"       },
       72 +        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS delete queue"      },
       73 +        {       DMU_BSWAP_UINT8,        FALSE,  "zvol object"           },
       74 +        {       DMU_BSWAP_ZAP,          TRUE,   "zvol prop"             },
       75 +        {       DMU_BSWAP_UINT8,        FALSE,  "other uint8[]"         },
       76 +        {       DMU_BSWAP_UINT64,       FALSE,  "other uint64[]"        },
       77 +        {       DMU_BSWAP_ZAP,          TRUE,   "other ZAP"             },
       78 +        {       DMU_BSWAP_ZAP,          TRUE,   "persistent error log"  },
       79 +        {       DMU_BSWAP_UINT8,        TRUE,   "SPA history"           },
       80 +        {       DMU_BSWAP_UINT64,       TRUE,   "SPA history offsets"   },
       81 +        {       DMU_BSWAP_ZAP,          TRUE,   "Pool properties"       },
       82 +        {       DMU_BSWAP_ZAP,          TRUE,   "DSL permissions"       },
       83 +        {       DMU_BSWAP_ACL,          TRUE,   "ZFS ACL"               },
       84 +        {       DMU_BSWAP_UINT8,        TRUE,   "ZFS SYSACL"            },
       85 +        {       DMU_BSWAP_UINT8,        TRUE,   "FUID table"            },
       86 +        {       DMU_BSWAP_UINT64,       TRUE,   "FUID table size"       },
       87 +        {       DMU_BSWAP_ZAP,          TRUE,   "DSL dataset next clones"},
       88 +        {       DMU_BSWAP_ZAP,          TRUE,   "scan work queue"       },
       89 +        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS user/group used"   },
       90 +        {       DMU_BSWAP_ZAP,          TRUE,   "ZFS user/group quota"  },
       91 +        {       DMU_BSWAP_ZAP,          TRUE,   "snapshot refcount tags"},
       92 +        {       DMU_BSWAP_ZAP,          TRUE,   "DDT ZAP algorithm"     },
       93 +        {       DMU_BSWAP_ZAP,          TRUE,   "DDT statistics"        },
       94 +        {       DMU_BSWAP_UINT8,        TRUE,   "System attributes"     },
       95 +        {       DMU_BSWAP_ZAP,          TRUE,   "SA master node"        },
       96 +        {       DMU_BSWAP_ZAP,          TRUE,   "SA attr registration"  },
       97 +        {       DMU_BSWAP_ZAP,          TRUE,   "SA attr layouts"       },
       98 +        {       DMU_BSWAP_ZAP,          TRUE,   "scan translations"     },
       99 +        {       DMU_BSWAP_UINT8,        FALSE,  "deduplicated block"    },
      100 +        {       DMU_BSWAP_ZAP,          TRUE,   "DSL deadlist map"      },
      101 +        {       DMU_BSWAP_UINT64,       TRUE,   "DSL deadlist map hdr"  },
      102 +        {       DMU_BSWAP_ZAP,          TRUE,   "DSL dir clones"        },
      103 +        {       DMU_BSWAP_UINT64,       TRUE,   "bpobj subobj"          }
 103  104  };
 104  105  
      106 +const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
      107 +        {       byteswap_uint8_array,   "uint8"         },
      108 +        {       byteswap_uint16_array,  "uint16"        },
      109 +        {       byteswap_uint32_array,  "uint32"        },
      110 +        {       byteswap_uint64_array,  "uint64"        },
      111 +        {       zap_byteswap,           "zap"           },
      112 +        {       dnode_buf_byteswap,     "dnode"         },
      113 +        {       dmu_objset_byteswap,    "objset"        },
      114 +        {       zfs_znode_byteswap,     "znode"         },
      115 +        {       zfs_oldacl_byteswap,    "oldacl"        },
      116 +        {       zfs_acl_byteswap,       "acl"           }
      117 +};
      118 +
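
The new dmu_ot_byteswap table decouples object types from their byteswap
routines: each dmu_ot entry above now carries a DMU_BSWAP_* index instead of
a function pointer, and that index selects a routine here. A minimal sketch
of how a consumer would resolve the routine through the two tables; the
DMU_OT_BYTESWAP() accessor and the ob_func member name are assumptions for
illustration, since this diff shows only the tables themselves:

        static void
        byteswap_by_type(dmu_object_type_t ot, void *buf, size_t size)
        {
                /* map the object type to its byteswap-function index */
                dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(ot);

                ASSERT(bswap < DMU_BSWAP_NUMFUNCS);
                /* dispatch through the table introduced above */
                dmu_ot_byteswap[bswap].ob_func(buf, size);
        }
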
 105  119  int
 106  120  dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
 107  121      void *tag, dmu_buf_t **dbp, int flags)
 108  122  {
 109  123          dnode_t *dn;
 110  124          uint64_t blkid;
 111  125          dmu_buf_impl_t *db;
 112  126          int err;
 113  127          int db_flags = DB_RF_CANFAIL;
 114  128  
 115  129          if (flags & DMU_READ_NO_PREFETCH)
 116  130                  db_flags |= DB_RF_NOPREFETCH;
 117  131  
 118  132          err = dnode_hold(os, object, FTAG, &dn);
 119  133          if (err)
 120  134                  return (err);
 121  135          blkid = dbuf_whichblock(dn, offset);
 122  136          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 123  137          db = dbuf_hold(dn, blkid, tag);
 124  138          rw_exit(&dn->dn_struct_rwlock);
 125  139          if (db == NULL) {
 126  140                  err = EIO;
 127  141          } else {
 128  142                  err = dbuf_read(db, NULL, db_flags);
 129  143                  if (err) {
 130  144                          dbuf_rele(db, tag);
 131  145                          db = NULL;
 132  146                  }
 133  147          }
 134  148  
 135  149          dnode_rele(dn, FTAG);
 136  150          *dbp = &db->db; /* NULL db plus first field offset is NULL */
 137  151          return (err);
 138  152  }
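
The "NULL db plus first field offset is NULL" comment above leans on the
public dmu_buf_t being the first member of dmu_buf_impl_t: taking the address
of a struct's first member is pointer arithmetic with offset zero, so no
dereference occurs even when db is NULL, and *dbp comes back NULL alongside
the EIO. A two-line illustration of the idiom (not from this diff):

        dmu_buf_impl_t *db = NULL;
        dmu_buf_t *pub = &db->db;       /* base + offset 0, still NULL */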
 139  153  
 140  154  int
 141  155  dmu_bonus_max(void)
 142  156  {
 143  157          return (DN_MAX_BONUSLEN);
 144  158  }
 145  159  
 146  160  int
 147  161  dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 148  162  {
 149  163          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 150  164          dnode_t *dn;
 151  165          int error;
 152  166  
 153  167          DB_DNODE_ENTER(db);
 154  168          dn = DB_DNODE(db);
 155  169  
 156  170          if (dn->dn_bonus != db) {
 157  171                  error = EINVAL;
 158  172          } else if (newsize < 0 || newsize > db_fake->db_size) {
 159  173                  error = EINVAL;
 160  174          } else {
 161  175                  dnode_setbonuslen(dn, newsize, tx);
 162  176                  error = 0;
 163  177          }
 164  178  
 165  179          DB_DNODE_EXIT(db);
 166  180          return (error);
 167  181  }
 168  182  
  
  
 169  183  int
 170  184  dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 171  185  {
 172  186          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 173  187          dnode_t *dn;
 174  188          int error;
 175  189  
 176  190          DB_DNODE_ENTER(db);
 177  191          dn = DB_DNODE(db);
 178  192  
 179      -        if (type > DMU_OT_NUMTYPES) {
      193 +        if (!DMU_OT_IS_VALID(type)) {
 180  194                  error = EINVAL;
 181  195          } else if (dn->dn_bonus != db) {
 182  196                  error = EINVAL;
 183  197          } else {
 184  198                  dnode_setbonus_type(dn, type, tx);
 185  199                  error = 0;
 186  200          }
 187  201  
 188  202          DB_DNODE_EXIT(db);
 189  203          return (error);
 190  204  }
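
The switch to DMU_OT_IS_VALID() above matters for feature flags: an object
type may now be an encoding with embedded byteswap/metadata bits rather than
an index into dmu_ot, so a plain range test no longer suffices (and the old
">" comparison also let the out-of-range value DMU_OT_NUMTYPES itself pass).
A sketch of what such a macro can look like, assuming a DMU_OT_NEWTYPE flag
bit and a DMU_OT_BYTESWAP_MASK; the actual definition lives in dmu.h, outside
this file's diff:

        #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
                ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
                (ot) < DMU_OT_NUMTYPES)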
 191  205  
 192  206  dmu_object_type_t
 193  207  dmu_get_bonustype(dmu_buf_t *db_fake)
 194  208  {
 195  209          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 196  210          dnode_t *dn;
 197  211          dmu_object_type_t type;
 198  212  
 199  213          DB_DNODE_ENTER(db);
 200  214          dn = DB_DNODE(db);
 201  215          type = dn->dn_bonustype;
 202  216          DB_DNODE_EXIT(db);
 203  217  
 204  218          return (type);
 205  219  }
 206  220  
 207  221  int
 208  222  dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 209  223  {
 210  224          dnode_t *dn;
 211  225          int error;
 212  226  
 213  227          error = dnode_hold(os, object, FTAG, &dn);
 214  228          dbuf_rm_spill(dn, tx);
 215  229          rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 216  230          dnode_rm_spill(dn, tx);
 217  231          rw_exit(&dn->dn_struct_rwlock);
 218  232          dnode_rele(dn, FTAG);
 219  233          return (error);
 220  234  }
 221  235  
 222  236  /*
 223  237   * returns ENOENT, EIO, or 0.
 224  238   */
 225  239  int
 226  240  dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 227  241  {
 228  242          dnode_t *dn;
 229  243          dmu_buf_impl_t *db;
 230  244          int error;
 231  245  
 232  246          error = dnode_hold(os, object, FTAG, &dn);
 233  247          if (error)
 234  248                  return (error);
 235  249  
 236  250          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 237  251          if (dn->dn_bonus == NULL) {
 238  252                  rw_exit(&dn->dn_struct_rwlock);
 239  253                  rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 240  254                  if (dn->dn_bonus == NULL)
 241  255                          dbuf_create_bonus(dn);
 242  256          }
 243  257          db = dn->dn_bonus;
 244  258  
 245  259          /* as long as the bonus buf is held, the dnode will be held */
 246  260          if (refcount_add(&db->db_holds, tag) == 1) {
 247  261                  VERIFY(dnode_add_ref(dn, db));
 248  262                  (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
 249  263          }
 250  264  
 251  265          /*
 252  266           * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
 253  267           * hold and incrementing the dbuf count to ensure that dnode_move() sees
 254  268           * a dnode hold for every dbuf.
 255  269           */
 256  270          rw_exit(&dn->dn_struct_rwlock);
 257  271  
 258  272          dnode_rele(dn, FTAG);
 259  273  
 260  274          VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
 261  275  
 262  276          *dbp = &db->db;
 263  277          return (0);
 264  278  }
 265  279  
 266  280  /*
 267  281   * returns ENOENT, EIO, or 0.
 268  282   *
 269  283   * This interface will allocate a blank spill dbuf when a spill blk
 270  284   * doesn't already exist on the dnode.
 271  285   *
 272  286   * if you only want to find an already existing spill db, then
 273  287   * dmu_spill_hold_existing() should be used.
 274  288   */
 275  289  int
 276  290  dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
 277  291  {
 278  292          dmu_buf_impl_t *db = NULL;
 279  293          int err;
 280  294  
 281  295          if ((flags & DB_RF_HAVESTRUCT) == 0)
 282  296                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 283  297  
 284  298          db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
 285  299  
 286  300          if ((flags & DB_RF_HAVESTRUCT) == 0)
 287  301                  rw_exit(&dn->dn_struct_rwlock);
 288  302  
 289  303          ASSERT(db != NULL);
 290  304          err = dbuf_read(db, NULL, flags);
 291  305          if (err == 0)
 292  306                  *dbp = &db->db;
 293  307          else
 294  308                  dbuf_rele(db, tag);
 295  309          return (err);
 296  310  }
 297  311  
 298  312  int
 299  313  dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 300  314  {
 301  315          dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 302  316          dnode_t *dn;
 303  317          int err;
 304  318  
 305  319          DB_DNODE_ENTER(db);
 306  320          dn = DB_DNODE(db);
 307  321  
 308  322          if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
 309  323                  err = EINVAL;
 310  324          } else {
 311  325                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 312  326  
 313  327                  if (!dn->dn_have_spill) {
 314  328                          err = ENOENT;
 315  329                  } else {
 316  330                          err = dmu_spill_hold_by_dnode(dn,
 317  331                              DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
 318  332                  }
 319  333  
 320  334                  rw_exit(&dn->dn_struct_rwlock);
 321  335          }
 322  336  
 323  337          DB_DNODE_EXIT(db);
 324  338          return (err);
 325  339  }
 326  340  
 327  341  int
 328  342  dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 329  343  {
 330  344          dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 331  345          dnode_t *dn;
 332  346          int err;
 333  347  
 334  348          DB_DNODE_ENTER(db);
 335  349          dn = DB_DNODE(db);
 336  350          err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
 337  351          DB_DNODE_EXIT(db);
 338  352  
 339  353          return (err);
 340  354  }
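
Taken together: dmu_spill_hold_by_dnode() creates a blank spill dbuf on
demand, while dmu_spill_hold_existing() fails with ENOENT when the dnode has
no spill block. A hypothetical caller that reads an existing spill block and
tolerates its absence (bonus_db is assumed to be a bonus buffer the caller
already holds):

        dmu_buf_t *spill_db;
        int err = dmu_spill_hold_existing(bonus_db, FTAG, &spill_db);
        if (err == 0) {
                /* ... consume spill_db->db_data ... */
                dmu_buf_rele(spill_db, FTAG);
        } else if (err != ENOENT) {
                return (err);   /* EINVAL on pre-SA pools, or EIO */
        }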
 341  355  
 342  356  /*
 343  357   * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
 344  358   * to take a held dnode rather than <os, object> -- the lookup is wasteful,
 345  359   * and can induce severe lock contention when writing to several files
 346  360   * whose dnodes are in the same block.
 347  361   */
 348  362  static int
 349  363  dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
 350  364      int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 351  365  {
 352  366          dsl_pool_t *dp = NULL;
 353  367          dmu_buf_t **dbp;
 354  368          uint64_t blkid, nblks, i;
 355  369          uint32_t dbuf_flags;
 356  370          int err;
 357  371          zio_t *zio;
 358  372          hrtime_t start;
 359  373  
 360  374          ASSERT(length <= DMU_MAX_ACCESS);
 361  375  
 362  376          dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
 363  377          if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
 364  378                  dbuf_flags |= DB_RF_NOPREFETCH;
 365  379  
 366  380          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 367  381          if (dn->dn_datablkshift) {
 368  382                  int blkshift = dn->dn_datablkshift;
 369  383                  nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
 370  384                      P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
 371  385          } else {
 372  386                  if (offset + length > dn->dn_datablksz) {
 373  387                          zfs_panic_recover("zfs: accessing past end of object "
 374  388                              "%llx/%llx (size=%u access=%llu+%llu)",
 375  389                              (longlong_t)dn->dn_objset->
 376  390                              os_dsl_dataset->ds_object,
 377  391                              (longlong_t)dn->dn_object, dn->dn_datablksz,
 378  392                              (longlong_t)offset, (longlong_t)length);
 379  393                          rw_exit(&dn->dn_struct_rwlock);
 380  394                          return (EIO);
 381  395                  }
 382  396                  nblks = 1;
 383  397          }
 384  398          dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 385  399  
 386  400          if (dn->dn_objset->os_dsl_dataset)
 387  401                  dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
 388  402          if (dp && dsl_pool_sync_context(dp))
 389  403                  start = gethrtime();
 390  404          zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 391  405          blkid = dbuf_whichblock(dn, offset);
 392  406          for (i = 0; i < nblks; i++) {
 393  407                  dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
 394  408                  if (db == NULL) {
 395  409                          rw_exit(&dn->dn_struct_rwlock);
 396  410                          dmu_buf_rele_array(dbp, nblks, tag);
 397  411                          zio_nowait(zio);
 398  412                          return (EIO);
 399  413                  }
 400  414                  /* initiate async i/o */
 401  415                  if (read) {
 402  416                          (void) dbuf_read(db, zio, dbuf_flags);
 403  417                  }
 404  418                  dbp[i] = &db->db;
 405  419          }
 406  420          rw_exit(&dn->dn_struct_rwlock);
 407  421  
 408  422          /* wait for async i/o */
 409  423          err = zio_wait(zio);
 410  424          /* track read overhead when we are in sync context */
 411  425          if (dp && dsl_pool_sync_context(dp))
 412  426                  dp->dp_read_overhead += gethrtime() - start;
 413  427          if (err) {
 414  428                  dmu_buf_rele_array(dbp, nblks, tag);
 415  429                  return (err);
 416  430          }
 417  431  
 418  432          /* wait for other io to complete */
 419  433          if (read) {
 420  434                  for (i = 0; i < nblks; i++) {
 421  435                          dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 422  436                          mutex_enter(&db->db_mtx);
 423  437                          while (db->db_state == DB_READ ||
 424  438                              db->db_state == DB_FILL)
 425  439                                  cv_wait(&db->db_changed, &db->db_mtx);
 426  440                          if (db->db_state == DB_UNCACHED)
 427  441                                  err = EIO;
 428  442                          mutex_exit(&db->db_mtx);
 429  443                          if (err) {
 430  444                                  dmu_buf_rele_array(dbp, nblks, tag);
 431  445                                  return (err);
 432  446                          }
 433  447                  }
 434  448          }
 435  449  
 436  450          *numbufsp = nblks;
 437  451          *dbpp = dbp;
 438  452          return (0);
 439  453  }
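
For concreteness, the nblks arithmetic above rounds the byte range out to
block boundaries. With a hypothetical 128 KiB block size (blkshift = 17),
offset = 100 KiB (102400), and length = 200 KiB (204800):

        P2ALIGN(102400, 131072)            = 0
        P2ROUNDUP(102400 + 204800, 131072) = 393216
        nblks = (393216 - 0) >> 17         = 3

so three dbufs are held starting at dbuf_whichblock(dn, 102400) = 0, and the
reads on them are issued in parallel under the single root zio.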
 440  454  
 441  455  static int
 442  456  dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
 443  457      uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 444  458  {
 445  459          dnode_t *dn;
 446  460          int err;
 447  461  
 448  462          err = dnode_hold(os, object, FTAG, &dn);
 449  463          if (err)
 450  464                  return (err);
 451  465  
 452  466          err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 453  467              numbufsp, dbpp, DMU_READ_PREFETCH);
 454  468  
 455  469          dnode_rele(dn, FTAG);
 456  470  
 457  471          return (err);
 458  472  }
 459  473  
 460  474  int
 461  475  dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
 462  476      uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 463  477  {
 464  478          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 465  479          dnode_t *dn;
 466  480          int err;
 467  481  
 468  482          DB_DNODE_ENTER(db);
 469  483          dn = DB_DNODE(db);
 470  484          err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 471  485              numbufsp, dbpp, DMU_READ_PREFETCH);
 472  486          DB_DNODE_EXIT(db);
 473  487  
 474  488          return (err);
 475  489  }
 476  490  
 477  491  void
 478  492  dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 479  493  {
 480  494          int i;
 481  495          dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
 482  496  
 483  497          if (numbufs == 0)
 484  498                  return;
 485  499  
 486  500          for (i = 0; i < numbufs; i++) {
 487  501                  if (dbp[i])
 488  502                          dbuf_rele(dbp[i], tag);
 489  503          }
 490  504  
 491  505          kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 492  506  }
 493  507  
 494  508  void
 495  509  dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
 496  510  {
 497  511          dnode_t *dn;
 498  512          uint64_t blkid;
 499  513          int nblks, i, err;
 500  514  
 501  515          if (zfs_prefetch_disable)
 502  516                  return;
 503  517  
 504  518          if (len == 0) {  /* they're interested in the bonus buffer */
 505  519                  dn = DMU_META_DNODE(os);
 506  520  
 507  521                  if (object == 0 || object >= DN_MAX_OBJECT)
 508  522                          return;
 509  523  
 510  524                  rw_enter(&dn->dn_struct_rwlock, RW_READER);
 511  525                  blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
 512  526                  dbuf_prefetch(dn, blkid);
 513  527                  rw_exit(&dn->dn_struct_rwlock);
 514  528                  return;
 515  529          }
 516  530  
 517  531          /*
 518  532           * XXX - Note, if the dnode for the requested object is not
 519  533           * already cached, we will do a *synchronous* read in the
 520  534           * dnode_hold() call.  The same is true for any indirects.
 521  535           */
 522  536          err = dnode_hold(os, object, FTAG, &dn);
 523  537          if (err != 0)
 524  538                  return;
 525  539  
 526  540          rw_enter(&dn->dn_struct_rwlock, RW_READER);
 527  541          if (dn->dn_datablkshift) {
 528  542                  int blkshift = dn->dn_datablkshift;
 529  543                  nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
 530  544                      P2ALIGN(offset, 1<<blkshift)) >> blkshift;
 531  545          } else {
 532  546                  nblks = (offset < dn->dn_datablksz);
 533  547          }
 534  548  
 535  549          if (nblks != 0) {
 536  550                  blkid = dbuf_whichblock(dn, offset);
 537  551                  for (i = 0; i < nblks; i++)
 538  552                          dbuf_prefetch(dn, blkid+i);
 539  553          }
 540  554  
 541  555          rw_exit(&dn->dn_struct_rwlock);
 542  556  
 543  557          dnode_rele(dn, FTAG);
 544  558  }
 545  559  
 546  560  /*
 547  561   * Get the next "chunk" of file data to free.  We traverse the file from
  548  562   * the end so that the file gets shorter over time (if we crash in the
 549  563   * middle, this will leave us in a better state).  We find allocated file
 550  564   * data by simply searching the allocated level 1 indirects.
 551  565   */
 552  566  static int
 553  567  get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
 554  568  {
 555  569          uint64_t len = *start - limit;
 556  570          uint64_t blkcnt = 0;
 557  571          uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
 558  572          uint64_t iblkrange =
 559  573              dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 560  574  
 561  575          ASSERT(limit <= *start);
 562  576  
 563  577          if (len <= iblkrange * maxblks) {
 564  578                  *start = limit;
 565  579                  return (0);
 566  580          }
 567  581          ASSERT(ISP2(iblkrange));
 568  582  
 569  583          while (*start > limit && blkcnt < maxblks) {
 570  584                  int err;
 571  585  
 572  586                  /* find next allocated L1 indirect */
 573  587                  err = dnode_next_offset(dn,
 574  588                      DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 575  589  
 576  590                  /* if there are no more, then we are done */
 577  591                  if (err == ESRCH) {
 578  592                          *start = limit;
 579  593                          return (0);
 580  594                  } else if (err) {
 581  595                          return (err);
 582  596                  }
 583  597                  blkcnt += 1;
 584  598  
 585  599                  /* reset offset to end of "next" block back */
 586  600                  *start = P2ALIGN(*start, iblkrange);
 587  601                  if (*start <= limit)
 588  602                          *start = limit;
 589  603                  else
 590  604                          *start -= 1;
 591  605          }
 592  606          return (0);
 593  607  }
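
A worked example of the chunk sizing (the constants are assumptions for
illustration; none of them appear in this diff): with 128 KiB data blocks
(datablksz = 131072), 16 KiB indirect blocks (indblkshift = 14), and 128-byte
block pointers (SPA_BLKPTRSHIFT = 7):

        EPB(14, 7) = 1 << (14 - 7) = 128 blkptrs per L1 indirect
        iblkrange  = 131072 * 128  = 16 MiB of file span per L1
        maxblks    = DMU_MAX_ACCESS / (1 << 15)

so each call walks *start backwards past at most maxblks allocated L1 ranges,
and dmu_free_long_range_impl() below commits one transaction per such chunk.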
 594  608  
 595  609  static int
 596  610  dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
 597  611      uint64_t length, boolean_t free_dnode)
 598  612  {
 599  613          dmu_tx_t *tx;
 600  614          uint64_t object_size, start, end, len;
 601  615          boolean_t trunc = (length == DMU_OBJECT_END);
 602  616          int align, err;
 603  617  
 604  618          align = 1 << dn->dn_datablkshift;
 605  619          ASSERT(align > 0);
 606  620          object_size = align == 1 ? dn->dn_datablksz :
 607  621              (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
 608  622  
 609  623          end = offset + length;
 610  624          if (trunc || end > object_size)
 611  625                  end = object_size;
 612  626          if (end <= offset)
 613  627                  return (0);
 614  628          length = end - offset;
 615  629  
 616  630          while (length) {
 617  631                  start = end;
 618  632                  /* assert(offset <= start) */
 619  633                  err = get_next_chunk(dn, &start, offset);
 620  634                  if (err)
 621  635                          return (err);
 622  636                  len = trunc ? DMU_OBJECT_END : end - start;
 623  637  
 624  638                  tx = dmu_tx_create(os);
 625  639                  dmu_tx_hold_free(tx, dn->dn_object, start, len);
 626  640                  err = dmu_tx_assign(tx, TXG_WAIT);
 627  641                  if (err) {
 628  642                          dmu_tx_abort(tx);
 629  643                          return (err);
 630  644                  }
 631  645  
 632  646                  dnode_free_range(dn, start, trunc ? -1 : len, tx);
 633  647  
 634  648                  if (start == 0 && free_dnode) {
 635  649                          ASSERT(trunc);
 636  650                          dnode_free(dn, tx);
 637  651                  }
 638  652  
 639  653                  length -= end - start;
 640  654  
 641  655                  dmu_tx_commit(tx);
 642  656                  end = start;
 643  657          }
 644  658          return (0);
 645  659  }
 646  660  
 647  661  int
 648  662  dmu_free_long_range(objset_t *os, uint64_t object,
 649  663      uint64_t offset, uint64_t length)
 650  664  {
 651  665          dnode_t *dn;
 652  666          int err;
 653  667  
 654  668          err = dnode_hold(os, object, FTAG, &dn);
 655  669          if (err != 0)
 656  670                  return (err);
 657  671          err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
 658  672          dnode_rele(dn, FTAG);
 659  673          return (err);
 660  674  }
 661  675  
 662  676  int
 663  677  dmu_free_object(objset_t *os, uint64_t object)
 664  678  {
 665  679          dnode_t *dn;
 666  680          dmu_tx_t *tx;
 667  681          int err;
 668  682  
 669  683          err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
 670  684              FTAG, &dn);
 671  685          if (err != 0)
 672  686                  return (err);
 673  687          if (dn->dn_nlevels == 1) {
 674  688                  tx = dmu_tx_create(os);
 675  689                  dmu_tx_hold_bonus(tx, object);
 676  690                  dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
 677  691                  err = dmu_tx_assign(tx, TXG_WAIT);
 678  692                  if (err == 0) {
 679  693                          dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
 680  694                          dnode_free(dn, tx);
 681  695                          dmu_tx_commit(tx);
 682  696                  } else {
 683  697                          dmu_tx_abort(tx);
 684  698                  }
 685  699          } else {
 686  700                  err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
 687  701          }
 688  702          dnode_rele(dn, FTAG);
 689  703          return (err);
 690  704  }
 691  705  
 692  706  int
 693  707  dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 694  708      uint64_t size, dmu_tx_t *tx)
 695  709  {
 696  710          dnode_t *dn;
 697  711          int err = dnode_hold(os, object, FTAG, &dn);
 698  712          if (err)
 699  713                  return (err);
 700  714          ASSERT(offset < UINT64_MAX);
 701  715          ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
 702  716          dnode_free_range(dn, offset, size, tx);
 703  717          dnode_rele(dn, FTAG);
 704  718          return (0);
 705  719  }
 706  720  
 707  721  int
 708  722  dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 709  723      void *buf, uint32_t flags)
 710  724  {
 711  725          dnode_t *dn;
 712  726          dmu_buf_t **dbp;
 713  727          int numbufs, err;
 714  728  
 715  729          err = dnode_hold(os, object, FTAG, &dn);
 716  730          if (err)
 717  731                  return (err);
 718  732  
 719  733          /*
 720  734           * Deal with odd block sizes, where there can't be data past the first
 721  735           * block.  If we ever do the tail block optimization, we will need to
 722  736           * handle that here as well.
 723  737           */
 724  738          if (dn->dn_maxblkid == 0) {
 725  739                  int newsz = offset > dn->dn_datablksz ? 0 :
 726  740                      MIN(size, dn->dn_datablksz - offset);
 727  741                  bzero((char *)buf + newsz, size - newsz);
 728  742                  size = newsz;
 729  743          }
 730  744  
 731  745          while (size > 0) {
 732  746                  uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 733  747                  int i;
 734  748  
 735  749                  /*
 736  750                   * NB: we could do this block-at-a-time, but it's nice
 737  751                   * to be reading in parallel.
 738  752                   */
 739  753                  err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 740  754                      TRUE, FTAG, &numbufs, &dbp, flags);
 741  755                  if (err)
 742  756                          break;
 743  757  
 744  758                  for (i = 0; i < numbufs; i++) {
 745  759                          int tocpy;
 746  760                          int bufoff;
 747  761                          dmu_buf_t *db = dbp[i];
 748  762  
 749  763                          ASSERT(size > 0);
 750  764  
 751  765                          bufoff = offset - db->db_offset;
 752  766                          tocpy = (int)MIN(db->db_size - bufoff, size);
 753  767  
 754  768                          bcopy((char *)db->db_data + bufoff, buf, tocpy);
 755  769  
 756  770                          offset += tocpy;
 757  771                          size -= tocpy;
 758  772                          buf = (char *)buf + tocpy;
 759  773                  }
 760  774                  dmu_buf_rele_array(dbp, numbufs, FTAG);
 761  775          }
 762  776          dnode_rele(dn, FTAG);
 763  777          return (err);
 764  778  }
 765  779  
 766  780  void
 767  781  dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 768  782      const void *buf, dmu_tx_t *tx)
 769  783  {
 770  784          dmu_buf_t **dbp;
 771  785          int numbufs, i;
 772  786  
 773  787          if (size == 0)
 774  788                  return;
 775  789  
 776  790          VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 777  791              FALSE, FTAG, &numbufs, &dbp));
 778  792  
 779  793          for (i = 0; i < numbufs; i++) {
 780  794                  int tocpy;
 781  795                  int bufoff;
 782  796                  dmu_buf_t *db = dbp[i];
 783  797  
 784  798                  ASSERT(size > 0);
 785  799  
 786  800                  bufoff = offset - db->db_offset;
 787  801                  tocpy = (int)MIN(db->db_size - bufoff, size);
 788  802  
 789  803                  ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 790  804  
 791  805                  if (tocpy == db->db_size)
 792  806                          dmu_buf_will_fill(db, tx);
 793  807                  else
 794  808                          dmu_buf_will_dirty(db, tx);
 795  809  
 796  810                  bcopy(buf, (char *)db->db_data + bufoff, tocpy);
 797  811  
 798  812                  if (tocpy == db->db_size)
 799  813                          dmu_buf_fill_done(db, tx);
 800  814  
 801  815                  offset += tocpy;
 802  816                  size -= tocpy;
 803  817                  buf = (char *)buf + tocpy;
 804  818          }
 805  819          dmu_buf_rele_array(dbp, numbufs, FTAG);
 806  820  }
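
dmu_write() expects a transaction that already holds the written range and
has been assigned to a txg (dmu_read() above needs none). A hypothetical
caller pattern, with os, object, offset, size, and buf supplied by the
caller:

        dmu_tx_t *tx = dmu_tx_create(os);
        dmu_tx_hold_write(tx, object, offset, size);
        int err = dmu_tx_assign(tx, TXG_WAIT);
        if (err) {
                dmu_tx_abort(tx);       /* never commit an unassigned tx */
                return (err);
        }
        dmu_write(os, object, offset, size, buf, tx);
        dmu_tx_commit(tx);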
 807  821  
 808  822  void
 809  823  dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 810  824      dmu_tx_t *tx)
 811  825  {
 812  826          dmu_buf_t **dbp;
 813  827          int numbufs, i;
 814  828  
 815  829          if (size == 0)
 816  830                  return;
 817  831  
 818  832          VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 819  833              FALSE, FTAG, &numbufs, &dbp));
 820  834  
 821  835          for (i = 0; i < numbufs; i++) {
 822  836                  dmu_buf_t *db = dbp[i];
 823  837  
 824  838                  dmu_buf_will_not_fill(db, tx);
 825  839          }
 826  840          dmu_buf_rele_array(dbp, numbufs, FTAG);
 827  841  }
 828  842  
 829  843  /*
 830  844   * DMU support for xuio
 831  845   */
 832  846  kstat_t *xuio_ksp = NULL;
 833  847  
 834  848  int
 835  849  dmu_xuio_init(xuio_t *xuio, int nblk)
 836  850  {
 837  851          dmu_xuio_t *priv;
 838  852          uio_t *uio = &xuio->xu_uio;
 839  853  
 840  854          uio->uio_iovcnt = nblk;
 841  855          uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
 842  856  
 843  857          priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
 844  858          priv->cnt = nblk;
 845  859          priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
 846  860          priv->iovp = uio->uio_iov;
 847  861          XUIO_XUZC_PRIV(xuio) = priv;
 848  862  
 849  863          if (XUIO_XUZC_RW(xuio) == UIO_READ)
 850  864                  XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
 851  865          else
 852  866                  XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
 853  867  
 854  868          return (0);
 855  869  }
 856  870  
 857  871  void
 858  872  dmu_xuio_fini(xuio_t *xuio)
 859  873  {
 860  874          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 861  875          int nblk = priv->cnt;
 862  876  
 863  877          kmem_free(priv->iovp, nblk * sizeof (iovec_t));
 864  878          kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
 865  879          kmem_free(priv, sizeof (dmu_xuio_t));
 866  880  
 867  881          if (XUIO_XUZC_RW(xuio) == UIO_READ)
 868  882                  XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
 869  883          else
 870  884                  XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
 871  885  }
 872  886  
 873  887  /*
 874  888   * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
 875  889   * and increase priv->next by 1.
 876  890   */
 877  891  int
 878  892  dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
 879  893  {
 880  894          struct iovec *iov;
 881  895          uio_t *uio = &xuio->xu_uio;
 882  896          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 883  897          int i = priv->next++;
 884  898  
 885  899          ASSERT(i < priv->cnt);
 886  900          ASSERT(off + n <= arc_buf_size(abuf));
 887  901          iov = uio->uio_iov + i;
 888  902          iov->iov_base = (char *)abuf->b_data + off;
 889  903          iov->iov_len = n;
 890  904          priv->bufs[i] = abuf;
 891  905          return (0);
 892  906  }
 893  907  
 894  908  int
 895  909  dmu_xuio_cnt(xuio_t *xuio)
 896  910  {
 897  911          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 898  912          return (priv->cnt);
 899  913  }
 900  914  
 901  915  arc_buf_t *
 902  916  dmu_xuio_arcbuf(xuio_t *xuio, int i)
 903  917  {
 904  918          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 905  919  
 906  920          ASSERT(i < priv->cnt);
 907  921          return (priv->bufs[i]);
 908  922  }
 909  923  
 910  924  void
 911  925  dmu_xuio_clear(xuio_t *xuio, int i)
 912  926  {
 913  927          dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 914  928  
 915  929          ASSERT(i < priv->cnt);
 916  930          priv->bufs[i] = NULL;
 917  931  }
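
The xuio helpers above manage loaned arc buffers for zero-copy I/O:
dmu_xuio_init() sizes the iovec and buffer arrays, dmu_xuio_add() loans one
buffer into a slot, dmu_xuio_clear() drops a slot once the consumer returns
the buffer, and dmu_xuio_fini() frees the arrays and settles the on-loan
kstats. A compressed sketch of the lifecycle (error handling elided; abuf
would come from dbuf_loan_arcbuf() as in dmu_read_uio() below):

        (void) dmu_xuio_init(xuio, nblk);       /* nblk zeroed iovec slots */
        (void) dmu_xuio_add(xuio, abuf, bufoff, tocpy); /* loan one buffer */
        /* consumer drains the iovecs, then hands the buffers back */
        dmu_xuio_clear(xuio, i);                /* slot i has been returned */
        dmu_xuio_fini(xuio);                    /* free arrays, fix kstats */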
 918  932  
 919  933  static void
 920  934  xuio_stat_init(void)
 921  935  {
 922  936          xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
 923  937              KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
 924  938              KSTAT_FLAG_VIRTUAL);
 925  939          if (xuio_ksp != NULL) {
 926  940                  xuio_ksp->ks_data = &xuio_stats;
 927  941                  kstat_install(xuio_ksp);
 928  942          }
 929  943  }
 930  944  
 931  945  static void
 932  946  xuio_stat_fini(void)
 933  947  {
 934  948          if (xuio_ksp != NULL) {
 935  949                  kstat_delete(xuio_ksp);
 936  950                  xuio_ksp = NULL;
 937  951          }
 938  952  }
 939  953  
 940  954  void
 941  955  xuio_stat_wbuf_copied()
 942  956  {
 943  957          XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 944  958  }
 945  959  
 946  960  void
 947  961  xuio_stat_wbuf_nocopy()
 948  962  {
 949  963          XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
 950  964  }
 951  965  
 952  966  #ifdef _KERNEL
 953  967  int
 954  968  dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
 955  969  {
 956  970          dmu_buf_t **dbp;
 957  971          int numbufs, i, err;
 958  972          xuio_t *xuio = NULL;
 959  973  
 960  974          /*
 961  975           * NB: we could do this block-at-a-time, but it's nice
 962  976           * to be reading in parallel.
 963  977           */
 964  978          err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
 965  979              &numbufs, &dbp);
 966  980          if (err)
 967  981                  return (err);
 968  982  
 969  983          if (uio->uio_extflg == UIO_XUIO)
 970  984                  xuio = (xuio_t *)uio;
 971  985  
 972  986          for (i = 0; i < numbufs; i++) {
 973  987                  int tocpy;
 974  988                  int bufoff;
 975  989                  dmu_buf_t *db = dbp[i];
 976  990  
 977  991                  ASSERT(size > 0);
 978  992  
 979  993                  bufoff = uio->uio_loffset - db->db_offset;
 980  994                  tocpy = (int)MIN(db->db_size - bufoff, size);
 981  995  
 982  996                  if (xuio) {
 983  997                          dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 984  998                          arc_buf_t *dbuf_abuf = dbi->db_buf;
 985  999                          arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
 986 1000                          err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
 987 1001                          if (!err) {
 988 1002                                  uio->uio_resid -= tocpy;
 989 1003                                  uio->uio_loffset += tocpy;
 990 1004                          }
 991 1005  
 992 1006                          if (abuf == dbuf_abuf)
 993 1007                                  XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
 994 1008                          else
 995 1009                                  XUIOSTAT_BUMP(xuiostat_rbuf_copied);
 996 1010                  } else {
 997 1011                          err = uiomove((char *)db->db_data + bufoff, tocpy,
 998 1012                              UIO_READ, uio);
 999 1013                  }
1000 1014                  if (err)
1001 1015                          break;
1002 1016  
1003 1017                  size -= tocpy;
1004 1018          }
1005 1019          dmu_buf_rele_array(dbp, numbufs, FTAG);
1006 1020  
1007 1021          return (err);
1008 1022  }
1009 1023  
1010 1024  static int
1011 1025  dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
1012 1026  {
1013 1027          dmu_buf_t **dbp;
1014 1028          int numbufs;
1015 1029          int err = 0;
1016 1030          int i;
1017 1031  
1018 1032          err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1019 1033              FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
1020 1034          if (err)
1021 1035                  return (err);
1022 1036  
1023 1037          for (i = 0; i < numbufs; i++) {
1024 1038                  int tocpy;
1025 1039                  int bufoff;
1026 1040                  dmu_buf_t *db = dbp[i];
1027 1041  
1028 1042                  ASSERT(size > 0);
1029 1043  
1030 1044                  bufoff = uio->uio_loffset - db->db_offset;
1031 1045                  tocpy = (int)MIN(db->db_size - bufoff, size);
1032 1046  
1033 1047                  ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1034 1048  
1035 1049                  if (tocpy == db->db_size)
1036 1050                          dmu_buf_will_fill(db, tx);
1037 1051                  else
1038 1052                          dmu_buf_will_dirty(db, tx);
1039 1053  
1040 1054                  /*
1041 1055                   * XXX uiomove could block forever (eg. nfs-backed
1042 1056                   * pages).  There needs to be a uiolockdown() function
1043 1057                   * to lock the pages in memory, so that uiomove won't
1044 1058                   * block.
1045 1059                   */
1046 1060                  err = uiomove((char *)db->db_data + bufoff, tocpy,
1047 1061                      UIO_WRITE, uio);
1048 1062  
1049 1063                  if (tocpy == db->db_size)
1050 1064                          dmu_buf_fill_done(db, tx);
1051 1065  
1052 1066                  if (err)
1053 1067                          break;
1054 1068  
1055 1069                  size -= tocpy;
1056 1070          }
1057 1071  
1058 1072          dmu_buf_rele_array(dbp, numbufs, FTAG);
1059 1073          return (err);
1060 1074  }
1061 1075  
1062 1076  int
1063 1077  dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
1064 1078      dmu_tx_t *tx)
1065 1079  {
1066 1080          dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1067 1081          dnode_t *dn;
1068 1082          int err;
1069 1083  
1070 1084          if (size == 0)
1071 1085                  return (0);
1072 1086  
1073 1087          DB_DNODE_ENTER(db);
1074 1088          dn = DB_DNODE(db);
1075 1089          err = dmu_write_uio_dnode(dn, uio, size, tx);
1076 1090          DB_DNODE_EXIT(db);
1077 1091  
1078 1092          return (err);
1079 1093  }
1080 1094  
1081 1095  int
1082 1096  dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
1083 1097      dmu_tx_t *tx)
1084 1098  {
1085 1099          dnode_t *dn;
1086 1100          int err;
1087 1101  
1088 1102          if (size == 0)
1089 1103                  return (0);
1090 1104  
1091 1105          err = dnode_hold(os, object, FTAG, &dn);
1092 1106          if (err)
1093 1107                  return (err);
1094 1108  
1095 1109          err = dmu_write_uio_dnode(dn, uio, size, tx);
1096 1110  
1097 1111          dnode_rele(dn, FTAG);
1098 1112  
1099 1113          return (err);
1100 1114  }
1101 1115  
1102 1116  int
1103 1117  dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1104 1118      page_t *pp, dmu_tx_t *tx)
1105 1119  {
1106 1120          dmu_buf_t **dbp;
1107 1121          int numbufs, i;
1108 1122          int err;
1109 1123  
1110 1124          if (size == 0)
1111 1125                  return (0);
1112 1126  
1113 1127          err = dmu_buf_hold_array(os, object, offset, size,
1114 1128              FALSE, FTAG, &numbufs, &dbp);
1115 1129          if (err)
1116 1130                  return (err);
1117 1131  
1118 1132          for (i = 0; i < numbufs; i++) {
1119 1133                  int tocpy, copied, thiscpy;
1120 1134                  int bufoff;
1121 1135                  dmu_buf_t *db = dbp[i];
1122 1136                  caddr_t va;
1123 1137  
1124 1138                  ASSERT(size > 0);
1125 1139                  ASSERT3U(db->db_size, >=, PAGESIZE);
1126 1140  
1127 1141                  bufoff = offset - db->db_offset;
1128 1142                  tocpy = (int)MIN(db->db_size - bufoff, size);
1129 1143  
1130 1144                  ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1131 1145  
1132 1146                  if (tocpy == db->db_size)
1133 1147                          dmu_buf_will_fill(db, tx);
1134 1148                  else
1135 1149                          dmu_buf_will_dirty(db, tx);
1136 1150  
1137 1151                  for (copied = 0; copied < tocpy; copied += PAGESIZE) {
1138 1152                          ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
1139 1153                          thiscpy = MIN(PAGESIZE, tocpy - copied);
1140 1154                          va = zfs_map_page(pp, S_READ);
1141 1155                          bcopy(va, (char *)db->db_data + bufoff, thiscpy);
1142 1156                          zfs_unmap_page(pp, va);
1143 1157                          pp = pp->p_next;
1144 1158                          bufoff += PAGESIZE;
1145 1159                  }
1146 1160  
1147 1161                  if (tocpy == db->db_size)
1148 1162                          dmu_buf_fill_done(db, tx);
1149 1163  
1150 1164                  offset += tocpy;
1151 1165                  size -= tocpy;
1152 1166          }
1153 1167          dmu_buf_rele_array(dbp, numbufs, FTAG);
1154 1168          return (err);
1155 1169  }
1156 1170  #endif
1157 1171  
1158 1172  /*
1159 1173   * Allocate a loaned anonymous arc buffer.
1160 1174   */
1161 1175  arc_buf_t *
1162 1176  dmu_request_arcbuf(dmu_buf_t *handle, int size)
1163 1177  {
1164 1178          dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
1165 1179          spa_t *spa;
1166 1180  
1167 1181          DB_GET_SPA(&spa, db);
1168 1182          return (arc_loan_buf(spa, size));
1169 1183  }
1170 1184  
1171 1185  /*
1172 1186   * Free a loaned arc buffer.
1173 1187   */
1174 1188  void
1175 1189  dmu_return_arcbuf(arc_buf_t *buf)
1176 1190  {
1177 1191          arc_return_buf(buf, FTAG);
1178 1192          VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
1179 1193  }
1180 1194  
1181 1195  /*
1182 1196   * When possible directly assign passed loaned arc buffer to a dbuf.
1183 1197   * If this is not possible copy the contents of passed arc buf via
1184 1198   * dmu_write().
1185 1199   */
1186 1200  void
1187 1201  dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
1188 1202      dmu_tx_t *tx)
1189 1203  {
1190 1204          dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
1191 1205          dnode_t *dn;
1192 1206          dmu_buf_impl_t *db;
1193 1207          uint32_t blksz = (uint32_t)arc_buf_size(buf);
1194 1208          uint64_t blkid;
1195 1209  
1196 1210          DB_DNODE_ENTER(dbuf);
1197 1211          dn = DB_DNODE(dbuf);
1198 1212          rw_enter(&dn->dn_struct_rwlock, RW_READER);
1199 1213          blkid = dbuf_whichblock(dn, offset);
1200 1214          VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
1201 1215          rw_exit(&dn->dn_struct_rwlock);
1202 1216          DB_DNODE_EXIT(dbuf);
1203 1217  
1204 1218          if (offset == db->db.db_offset && blksz == db->db.db_size) {
1205 1219                  dbuf_assign_arcbuf(db, buf, tx);
1206 1220                  dbuf_rele(db, FTAG);
1207 1221          } else {
1208 1222                  objset_t *os;
1209 1223                  uint64_t object;
1210 1224  
1211 1225                  DB_DNODE_ENTER(dbuf);
1212 1226                  dn = DB_DNODE(dbuf);
1213 1227                  os = dn->dn_objset;
1214 1228                  object = dn->dn_object;
1215 1229                  DB_DNODE_EXIT(dbuf);
1216 1230  
1217 1231                  dbuf_rele(db, FTAG);
1218 1232                  dmu_write(os, object, offset, blksz, buf->b_data, tx);
1219 1233                  dmu_return_arcbuf(buf);
1220 1234                  XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1221 1235          }
1222 1236  }
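
Together, the three routines above form a zero-copy write path: borrow
a buffer from the ARC, fill it, then either hand it directly to a dbuf
or let dmu_assign_arcbuf() fall back to an ordinary dmu_write().  A
minimal sketch, assuming a held dbuf handle db, an assigned transaction
tx, and a hypothetical fill_buffer() helper:

    arc_buf_t *abuf = dmu_request_arcbuf(db, blksz); /* borrow from ARC */

    if (fill_buffer(abuf->b_data, blksz) != 0) {
            dmu_return_arcbuf(abuf);        /* error: return the loan */
    } else {
            /*
             * If offset and blksz match the target block exactly, the
             * buffer is assigned in place; otherwise dmu_assign_arcbuf()
             * copies via dmu_write() and returns the loan itself.
             */
            dmu_assign_arcbuf(db, offset, abuf, tx);
    }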
1223 1237  
1224 1238  typedef struct {
1225 1239          dbuf_dirty_record_t     *dsa_dr;
1226 1240          dmu_sync_cb_t           *dsa_done;
1227 1241          zgd_t                   *dsa_zgd;
1228 1242          dmu_tx_t                *dsa_tx;
1229 1243  } dmu_sync_arg_t;
1230 1244  
1231 1245  /* ARGSUSED */
1232 1246  static void
1233 1247  dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
1234 1248  {
1235 1249          dmu_sync_arg_t *dsa = varg;
1236 1250          dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
1237 1251          blkptr_t *bp = zio->io_bp;
1238 1252  
1239 1253          if (zio->io_error == 0) {
1240 1254                  if (BP_IS_HOLE(bp)) {
1241 1255                          /*
1242 1256                           * A block of zeros may compress to a hole, but the
1243 1257                           * block size still needs to be known for replay.
1244 1258                           */
1245 1259                          BP_SET_LSIZE(bp, db->db_size);
1246 1260                  } else {
1247 1261                          ASSERT(BP_GET_LEVEL(bp) == 0);
1248 1262                          bp->blk_fill = 1;
1249 1263                  }
1250 1264          }
1251 1265  }
1252 1266  
1253 1267  static void
1254 1268  dmu_sync_late_arrival_ready(zio_t *zio)
1255 1269  {
1256 1270          dmu_sync_ready(zio, NULL, zio->io_private);
1257 1271  }
1258 1272  
1259 1273  /* ARGSUSED */
1260 1274  static void
1261 1275  dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
1262 1276  {
1263 1277          dmu_sync_arg_t *dsa = varg;
1264 1278          dbuf_dirty_record_t *dr = dsa->dsa_dr;
1265 1279          dmu_buf_impl_t *db = dr->dr_dbuf;
1266 1280  
1267 1281          mutex_enter(&db->db_mtx);
1268 1282          ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1269 1283          if (zio->io_error == 0) {
1270 1284                  dr->dt.dl.dr_overridden_by = *zio->io_bp;
1271 1285                  dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1272 1286                  dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1273 1287                  if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
1274 1288                          BP_ZERO(&dr->dt.dl.dr_overridden_by);
1275 1289          } else {
1276 1290                  dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1277 1291          }
1278 1292          cv_broadcast(&db->db_changed);
1279 1293          mutex_exit(&db->db_mtx);
1280 1294  
1281 1295          dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1282 1296  
1283 1297          kmem_free(dsa, sizeof (*dsa));
1284 1298  }
1285 1299  
1286 1300  static void
1287 1301  dmu_sync_late_arrival_done(zio_t *zio)
1288 1302  {
1289 1303          blkptr_t *bp = zio->io_bp;
1290 1304          dmu_sync_arg_t *dsa = zio->io_private;
1291 1305  
1292 1306          if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1293 1307                  ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1294 1308                  ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1295 1309                  zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
1296 1310          }
1297 1311  
1298 1312          dmu_tx_commit(dsa->dsa_tx);
1299 1313  
1300 1314          dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1301 1315  
1302 1316          kmem_free(dsa, sizeof (*dsa));
1303 1317  }
1304 1318  
1305 1319  static int
1306 1320  dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
1307 1321      zio_prop_t *zp, zbookmark_t *zb)
1308 1322  {
1309 1323          dmu_sync_arg_t *dsa;
1310 1324          dmu_tx_t *tx;
1311 1325  
1312 1326          tx = dmu_tx_create(os);
1313 1327          dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1314 1328          if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1315 1329                  dmu_tx_abort(tx);
1316 1330                  return (EIO);   /* Make zl_get_data do txg_wait_synced() */
1317 1331          }
1318 1332  
1319 1333          dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1320 1334          dsa->dsa_dr = NULL;
1321 1335          dsa->dsa_done = done;
1322 1336          dsa->dsa_zgd = zgd;
1323 1337          dsa->dsa_tx = tx;
1324 1338  
1325 1339          zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1326 1340              zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
1327 1341              dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
1328 1342              ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1329 1343  
1330 1344          return (0);
1331 1345  }
1332 1346  
1333 1347  /*
1334 1348   * Intent log support: sync the block associated with db to disk.
1335 1349   * N.B. and XXX: the caller is responsible for making sure that the
1336 1350   * data isn't changing while dmu_sync() is writing it.
1337 1351   *
1338 1352   * Return values:
1339 1353   *
1340 1354   *      EEXIST: this txg has already been synced, so there's nothing to do.
1341 1355   *              The caller should not log the write.
1342 1356   *
1343 1357   *      ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1344 1358   *              The caller should not log the write.
1345 1359   *
1346 1360   *      EALREADY: this block is already in the process of being synced.
1347 1361   *              The caller should track its progress (somehow).
1348 1362   *
1349 1363   *      EIO: could not do the I/O.
1350 1364   *              The caller should do a txg_wait_synced().
1351 1365   *
1352 1366   *      0: the I/O has been initiated.
1353 1367   *              The caller should log this blkptr in the done callback.
1354 1368   *              It is possible that the I/O will fail, in which case
1355 1369   *              the error will be reported to the done callback and
1356 1370   *              propagated to pio from zio_done().
1357 1371   */
1358 1372  int
1359 1373  dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
1360 1374  {
1361 1375          blkptr_t *bp = zgd->zgd_bp;
1362 1376          dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1363 1377          objset_t *os = db->db_objset;
1364 1378          dsl_dataset_t *ds = os->os_dsl_dataset;
1365 1379          dbuf_dirty_record_t *dr;
1366 1380          dmu_sync_arg_t *dsa;
1367 1381          zbookmark_t zb;
1368 1382          zio_prop_t zp;
1369 1383          dnode_t *dn;
1370 1384  
1371 1385          ASSERT(pio != NULL);
1372 1386          ASSERT(BP_IS_HOLE(bp));
1373 1387          ASSERT(txg != 0);
1374 1388  
1375 1389          SET_BOOKMARK(&zb, ds->ds_object,
1376 1390              db->db.db_object, db->db_level, db->db_blkid);
1377 1391  
1378 1392          DB_DNODE_ENTER(db);
1379 1393          dn = DB_DNODE(db);
1380 1394          dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
1381 1395          DB_DNODE_EXIT(db);
1382 1396  
1383 1397          /*
1384 1398           * If we're frozen (running ziltest), we always need to generate a bp.
1385 1399           */
1386 1400          if (txg > spa_freeze_txg(os->os_spa))
1387 1401                  return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1388 1402  
1389 1403          /*
1390 1404           * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1391 1405           * and us.  If we determine that this txg is not yet syncing,
1392 1406           * but it begins to sync a moment later, that's OK because the
1393 1407           * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
1394 1408           */
1395 1409          mutex_enter(&db->db_mtx);
1396 1410  
1397 1411          if (txg <= spa_last_synced_txg(os->os_spa)) {
1398 1412                  /*
1399 1413                   * This txg has already synced.  There's nothing to do.
1400 1414                   */
1401 1415                  mutex_exit(&db->db_mtx);
1402 1416                  return (EEXIST);
1403 1417          }
1404 1418  
1405 1419          if (txg <= spa_syncing_txg(os->os_spa)) {
1406 1420                  /*
1407 1421                   * This txg is currently syncing, so we can't mess with
1408 1422                   * the dirty record anymore; just write a new log block.
1409 1423                   */
1410 1424                  mutex_exit(&db->db_mtx);
1411 1425                  return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1412 1426          }
1413 1427  
1414 1428          dr = db->db_last_dirty;
1415 1429          while (dr && dr->dr_txg != txg)
1416 1430                  dr = dr->dr_next;
1417 1431  
1418 1432          if (dr == NULL) {
1419 1433                  /*
1420 1434                   * There's no dr for this dbuf, so it must have been freed.
1421 1435                   * There's no need to log writes to freed blocks, so we're done.
1422 1436                   */
1423 1437                  mutex_exit(&db->db_mtx);
1424 1438                  return (ENOENT);
1425 1439          }
1426 1440  
1427 1441          ASSERT(dr->dr_txg == txg);
1428 1442          if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
1429 1443              dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
1430 1444                  /*
1431 1445                   * We have already issued a sync write for this buffer,
1432 1446                   * or this buffer has already been synced.  It could not
1433 1447                   * have been dirtied since, or we would have cleared the state.
1434 1448                   */
1435 1449                  mutex_exit(&db->db_mtx);
1436 1450                  return (EALREADY);
1437 1451          }
1438 1452  
1439 1453          ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1440 1454          dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1441 1455          mutex_exit(&db->db_mtx);
1442 1456  
1443 1457          dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1444 1458          dsa->dsa_dr = dr;
1445 1459          dsa->dsa_done = done;
1446 1460          dsa->dsa_zgd = zgd;
1447 1461          dsa->dsa_tx = NULL;
1448 1462  
1449 1463          zio_nowait(arc_write(pio, os->os_spa, txg,
1450 1464              bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
1451 1465              dmu_sync_ready, dmu_sync_done, dsa,
1452 1466              ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
1453 1467  
1454 1468          return (0);
1455 1469  }
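
Given the return-value contract documented above, a caller on the
intent-log path would dispatch roughly as in this sketch.  The callback
name zil_done() and the variables pio, txg, zgd, and os are assumptions
for illustration; txg_wait_synced() and dmu_objset_pool() are the same
routines dmu_offset_next() uses below.

    int error = dmu_sync(pio, txg, zil_done, zgd);

    switch (error) {
    case 0:         /* I/O initiated; log the blkptr from the callback */
            break;
    case EEXIST:    /* txg already synced */
    case ENOENT:    /* block was freed */
            break;  /* nothing to log */
    case EALREADY:  /* a sync write for this block is already in flight */
            break;  /* caller must track its progress */
    case EIO:       /* could not do the I/O */
            txg_wait_synced(dmu_objset_pool(os), txg);
            break;
    }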
1456 1470  
1457 1471  int
1458 1472  dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1459 1473          dmu_tx_t *tx)
1460 1474  {
1461 1475          dnode_t *dn;
1462 1476          int err;
1463 1477  
1464 1478          err = dnode_hold(os, object, FTAG, &dn);
1465 1479          if (err)
1466 1480                  return (err);
1467 1481          err = dnode_set_blksz(dn, size, ibs, tx);
1468 1482          dnode_rele(dn, FTAG);
1469 1483          return (err);
1470 1484  }
1471 1485  
1472 1486  void
1473 1487  dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
1474 1488          dmu_tx_t *tx)
1475 1489  {
1476 1490          dnode_t *dn;
1477 1491  
1478 1492          /* XXX assumes dnode_hold will not get an i/o error */
1479 1493          (void) dnode_hold(os, object, FTAG, &dn);
1480 1494          ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
1481 1495          dn->dn_checksum = checksum;
1482 1496          dnode_setdirty(dn, tx);
1483 1497          dnode_rele(dn, FTAG);
1484 1498  }
1485 1499  
1486 1500  void
1487 1501  dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
1488 1502          dmu_tx_t *tx)
1489 1503  {
1490 1504          dnode_t *dn;
1491 1505  
1492 1506          /* XXX assumes dnode_hold will not get an i/o error */
1493 1507          (void) dnode_hold(os, object, FTAG, &dn);
1494 1508          ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
1495 1509          dn->dn_compress = compress;
1496 1510          dnode_setdirty(dn, tx);
1497 1511          dnode_rele(dn, FTAG);
1498 1512  }
1499 1513  
1500 1514  int zfs_mdcomp_disable = 0;
1501 1515  
1502 1516  void
1503 1517  dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
1504 1518  {
1505 1519          dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
1506      -        boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
     1520 +        boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
1507 1521              (wp & WP_SPILL));
1508 1522          enum zio_checksum checksum = os->os_checksum;
1509 1523          enum zio_compress compress = os->os_compress;
1510 1524          enum zio_checksum dedup_checksum = os->os_dedup_checksum;
1511 1525          boolean_t dedup;
1512 1526          boolean_t dedup_verify = os->os_dedup_verify;
1513 1527          int copies = os->os_copies;
1514 1528  
1515 1529          /*
1516 1530           * Determine checksum setting.
1517 1531           */
1518 1532          if (ismd) {
1519 1533                  /*
1520 1534                   * Metadata always gets checksummed.  If the data
1521 1535                   * checksum is multi-bit correctable, and it's not a
1522 1536                   * ZBT-style checksum, then it's suitable for metadata
1523 1537                   * as well.  Otherwise, the metadata checksum defaults
1524 1538                   * to fletcher4.
1525 1539                   */
1526 1540                  if (zio_checksum_table[checksum].ci_correctable < 1 ||
1527 1541                      zio_checksum_table[checksum].ci_eck)
1528 1542                          checksum = ZIO_CHECKSUM_FLETCHER_4;
1529 1543          } else {
1530 1544                  checksum = zio_checksum_select(dn->dn_checksum, checksum);
1531 1545          }
1532 1546  
1533 1547          /*
1534 1548           * Determine compression setting.
1535 1549           */
1536 1550          if (ismd) {
1537 1551                  /*
1538 1552                   * XXX -- we should design a compression algorithm
1539 1553                   * that specializes in arrays of bps.
1540 1554                   */
1541 1555                  compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
1542 1556                      ZIO_COMPRESS_LZJB;
1543 1557          } else {
1544 1558                  compress = zio_compress_select(dn->dn_compress, compress);
1545 1559          }
1546 1560  
1547 1561          /*
1548 1562           * Determine dedup setting.  If we are in dmu_sync(), we won't
1549 1563           * actually dedup now because that's all done in syncing context;
1550 1564   * but we do want to use the dedup checksum.  If the checksum is not
1551 1565           * strong enough to ensure unique signatures, force dedup_verify.
1552 1566           */
1553 1567          dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
1554 1568          if (dedup) {
1555 1569                  checksum = dedup_checksum;
1556 1570                  if (!zio_checksum_table[checksum].ci_dedup)
1557 1571                          dedup_verify = 1;
1558 1572          }
1559 1573  
1560 1574          if (wp & WP_DMU_SYNC)
1561 1575                  dedup = 0;
1562 1576  
1563 1577          if (wp & WP_NOFILL) {
1564 1578                  ASSERT(!ismd && level == 0);
1565 1579                  checksum = ZIO_CHECKSUM_OFF;
1566 1580                  compress = ZIO_COMPRESS_OFF;
1567 1581                  dedup = B_FALSE;
1568 1582          }
1569 1583  
1570 1584          zp->zp_checksum = checksum;
1571 1585          zp->zp_compress = compress;
1572 1586          zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
1573 1587          zp->zp_level = level;
1574 1588          zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
1575 1589          zp->zp_dedup = dedup;
1576 1590          zp->zp_dedup_verify = dedup && dedup_verify;
1577 1591  }
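
Condensing the logic above: level > 0, a metadata object type, or a
spill block selects the metadata policy; everything else follows the
dataset (or per-dnode) properties.  A sketch of the call, mirroring the
dmu_sync() caller earlier in this file; db and os are assumed held:

    zio_prop_t zp;

    DB_DNODE_ENTER(db);
    /*
     * Metadata path (ismd): the checksum falls back to fletcher4
     * unless the configured one is multi-bit correctable and not
     * embedded; compression is lzjb unless zfs_mdcomp_disable is set;
     * one extra copy is added; dedup is never applied.
     */
    dmu_write_policy(os, DB_DNODE(db), db->db_level, WP_DMU_SYNC, &zp);
    DB_DNODE_EXIT(db);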
1578 1592  
1579 1593  int
1580 1594  dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
1581 1595  {
1582 1596          dnode_t *dn;
1583 1597          int i, err;
1584 1598  
1585 1599          err = dnode_hold(os, object, FTAG, &dn);
1586 1600          if (err)
1587 1601                  return (err);
1588 1602          /*
1589 1603           * Sync any current changes before
1590 1604           * we go trundling through the block pointers.
1591 1605           */
1592 1606          for (i = 0; i < TXG_SIZE; i++) {
1593 1607                  if (list_link_active(&dn->dn_dirty_link[i]))
1594 1608                          break;
1595 1609          }
1596 1610          if (i != TXG_SIZE) {
1597 1611                  dnode_rele(dn, FTAG);
1598 1612                  txg_wait_synced(dmu_objset_pool(os), 0);
1599 1613                  err = dnode_hold(os, object, FTAG, &dn);
1600 1614                  if (err)
1601 1615                          return (err);
1602 1616          }
1603 1617  
1604 1618          err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
1605 1619          dnode_rele(dn, FTAG);
1606 1620  
1607 1621          return (err);
1608 1622  }
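
dmu_offset_next() is the engine behind hole/data seeking.  A sketch of
an lseek(SEEK_HOLE)-style consumer follows; treating ESRCH (offset not
found) as ENXIO is this sketch's assumption about the caller, not
something this file dictates:

    uint64_t noff = file_offset;
    int error = dmu_offset_next(os, object, B_TRUE /* find hole */, &noff);

    if (error == 0)
            file_offset = noff;     /* next hole at or beyond the offset */
    else if (error == ESRCH)
            error = ENXIO;          /* no hole before end of file */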
1609 1623  
1610 1624  void
1611 1625  dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
1612 1626  {
1613 1627          dnode_phys_t *dnp;
1614 1628  
1615 1629          rw_enter(&dn->dn_struct_rwlock, RW_READER);
1616 1630          mutex_enter(&dn->dn_mtx);
1617 1631  
1618 1632          dnp = dn->dn_phys;
1619 1633  
1620 1634          doi->doi_data_block_size = dn->dn_datablksz;
1621 1635          doi->doi_metadata_block_size = dn->dn_indblkshift ?
1622 1636              1ULL << dn->dn_indblkshift : 0;
1623 1637          doi->doi_type = dn->dn_type;
1624 1638          doi->doi_bonus_type = dn->dn_bonustype;
1625 1639          doi->doi_bonus_size = dn->dn_bonuslen;
1626 1640          doi->doi_indirection = dn->dn_nlevels;
1627 1641          doi->doi_checksum = dn->dn_checksum;
1628 1642          doi->doi_compress = dn->dn_compress;
1629 1643          doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
1630 1644          doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
1631 1645          doi->doi_fill_count = 0;
1632 1646          for (int i = 0; i < dnp->dn_nblkptr; i++)
1633 1647                  doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
1634 1648  
1635 1649          mutex_exit(&dn->dn_mtx);
1636 1650          rw_exit(&dn->dn_struct_rwlock);
1637 1651  }
1638 1652  
1639 1653  /*
1640 1654   * Get information on a DMU object.
1641 1655   * If doi is NULL, just indicates whether the object exists.
1642 1656   */
1643 1657  int
1644 1658  dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
1645 1659  {
1646 1660          dnode_t *dn;
1647 1661          int err = dnode_hold(os, object, FTAG, &dn);
1648 1662  
1649 1663          if (err)
1650 1664                  return (err);
1651 1665  
1652 1666          if (doi != NULL)
1653 1667                  dmu_object_info_from_dnode(dn, doi);
1654 1668  
1655 1669          dnode_rele(dn, FTAG);
1656 1670          return (0);
1657 1671  }
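
Per the comment above, passing a NULL doi makes dmu_object_info() a
pure existence test.  A sketch of both uses; data_blksz is a
hypothetical local:

    dmu_object_info_t doi;
    int error;

    error = dmu_object_info(os, object, NULL);      /* exists? */
    if (error == 0)
            error = dmu_object_info(os, object, &doi);
    if (error == 0)
            data_blksz = doi.doi_data_block_size;   /* stat-style detail */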
1658 1672  
1659 1673  /*
1660 1674   * As above, but faster; can be used when you have a held dbuf in hand.
1661 1675   */
1662 1676  void
1663 1677  dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
1664 1678  {
1665 1679          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1666 1680  
1667 1681          DB_DNODE_ENTER(db);
1668 1682          dmu_object_info_from_dnode(DB_DNODE(db), doi);
1669 1683          DB_DNODE_EXIT(db);
1670 1684  }
1671 1685  
1672 1686  /*
1673 1687   * Faster still when you only care about the size.
1674 1688   * This is specifically optimized for zfs_getattr().
1675 1689   */
1676 1690  void
1677 1691  dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
1678 1692      u_longlong_t *nblk512)
1679 1693  {
1680 1694          dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1681 1695          dnode_t *dn;
1682 1696  
1683 1697          DB_DNODE_ENTER(db);
1684 1698          dn = DB_DNODE(db);
1685 1699  
1686 1700          *blksize = dn->dn_datablksz;
1687 1701          /* add 1 for dnode space */
1688 1702          *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
1689 1703              SPA_MINBLOCKSHIFT) + 1;
1690 1704          DB_DNODE_EXIT(db);
1691 1705  }
1692 1706  
1693 1707  void
1694 1708  byteswap_uint64_array(void *vbuf, size_t size)
1695 1709  {
1696 1710          uint64_t *buf = vbuf;
1697 1711          size_t count = size >> 3;
1698 1712          int i;
1699 1713  
1700 1714          ASSERT((size & 7) == 0);
1701 1715  
1702 1716          for (i = 0; i < count; i++)
1703 1717                  buf[i] = BSWAP_64(buf[i]);
1704 1718  }
1705 1719  
1706 1720  void
1707 1721  byteswap_uint32_array(void *vbuf, size_t size)
1708 1722  {
1709 1723          uint32_t *buf = vbuf;
1710 1724          size_t count = size >> 2;
1711 1725          int i;
1712 1726  
1713 1727          ASSERT((size & 3) == 0);
1714 1728  
1715 1729          for (i = 0; i < count; i++)
1716 1730                  buf[i] = BSWAP_32(buf[i]);
1717 1731  }
1718 1732  
1719 1733  void
1720 1734  byteswap_uint16_array(void *vbuf, size_t size)
1721 1735  {
1722 1736          uint16_t *buf = vbuf;
1723 1737          size_t count = size >> 1;
1724 1738          int i;
1725 1739  
1726 1740          ASSERT((size & 1) == 0);
1727 1741  
1728 1742          for (i = 0; i < count; i++)
1729 1743                  buf[i] = BSWAP_16(buf[i]);
1730 1744  }
1731 1745  
1732 1746  /* ARGSUSED */
1733 1747  void
1734 1748  byteswap_uint8_array(void *vbuf, size_t size)
1735 1749  {
1736 1750  }
1737 1751  
1738 1752  void
1739 1753  dmu_init(void)
1740 1754  {
1741 1755          zfs_dbgmsg_init();
1742 1756          sa_cache_init();
1743 1757          xuio_stat_init();
1744 1758          dmu_objset_init();
1745 1759          dnode_init();
1746 1760          dbuf_init();
1747 1761          zfetch_init();
1748 1762          arc_init();
1749 1763          l2arc_init();
1750 1764  }
1751 1765  
1752 1766  void
1753 1767  dmu_fini(void)
1754 1768  {
1755 1769          l2arc_fini();
1756 1770          arc_fini();
1757 1771          zfetch_fini();
1758 1772          dbuf_fini();
1759 1773          dnode_fini();
1760 1774          dmu_objset_fini();
1761 1775          xuio_stat_fini();
1762 1776          sa_cache_fini();
1763 1777          zfs_dbgmsg_fini();
1764 1778  }