Print this page
    
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/dmu_objset.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_objset.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  
    | 
      ↓ open down ↓ | 
    15 lines elided | 
    
      ↑ open up ↑ | 
  
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25   25   * Copyright (c) 2013, Joyent, Inc. All rights reserved.
       26 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  26   27   */
  27   28  
  28   29  /* Portions Copyright 2010 Robert Milkowski */
  29   30  
  30   31  #include <sys/cred.h>
  31   32  #include <sys/zfs_context.h>
  32   33  #include <sys/dmu_objset.h>
  33   34  #include <sys/dsl_dir.h>
  34   35  #include <sys/dsl_dataset.h>
  35   36  #include <sys/dsl_prop.h>
  36   37  #include <sys/dsl_pool.h>
  37   38  #include <sys/dsl_synctask.h>
  38   39  #include <sys/dsl_deleg.h>
  39   40  #include <sys/dnode.h>
  40   41  #include <sys/dbuf.h>
  41   42  #include <sys/zvol.h>
  42   43  #include <sys/dmu_tx.h>
  43   44  #include <sys/zap.h>
  44   45  #include <sys/zil.h>
  45   46  #include <sys/dmu_impl.h>
  46   47  #include <sys/zfs_ioctl.h>
  47   48  #include <sys/sa.h>
  48   49  #include <sys/zfs_onexit.h>
  49   50  #include <sys/dsl_destroy.h>
  50   51  
  51   52  /*
  52   53   * Needed to close a window in dnode_move() that allows the objset to be freed
  53   54   * before it can be safely accessed.
  54   55   */
  55   56  krwlock_t os_lock;
  56   57  
  57   58  void
  58   59  dmu_objset_init(void)
  59   60  {
  60   61          rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
  61   62  }
  62   63  
  63   64  void
  64   65  dmu_objset_fini(void)
  65   66  {
  66   67          rw_destroy(&os_lock);
  67   68  }
  68   69  
  69   70  spa_t *
  70   71  dmu_objset_spa(objset_t *os)
  71   72  {
  72   73          return (os->os_spa);
  73   74  }
  74   75  
  75   76  zilog_t *
  76   77  dmu_objset_zil(objset_t *os)
  77   78  {
  78   79          return (os->os_zil);
  79   80  }
  80   81  
  81   82  dsl_pool_t *
  82   83  dmu_objset_pool(objset_t *os)
  83   84  {
  84   85          dsl_dataset_t *ds;
  85   86  
  86   87          if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
  87   88                  return (ds->ds_dir->dd_pool);
  88   89          else
  89   90                  return (spa_get_dsl(os->os_spa));
  90   91  }
  91   92  
  92   93  dsl_dataset_t *
  93   94  dmu_objset_ds(objset_t *os)
  94   95  {
  95   96          return (os->os_dsl_dataset);
  96   97  }
  97   98  
  98   99  dmu_objset_type_t
  99  100  dmu_objset_type(objset_t *os)
 100  101  {
 101  102          return (os->os_phys->os_type);
 102  103  }
 103  104  
 104  105  void
 105  106  dmu_objset_name(objset_t *os, char *buf)
 106  107  {
 107  108          dsl_dataset_name(os->os_dsl_dataset, buf);
 108  109  }
 109  110  
 110  111  uint64_t
 111  112  dmu_objset_id(objset_t *os)
 112  113  {
 113  114          dsl_dataset_t *ds = os->os_dsl_dataset;
 114  115  
 115  116          return (ds ? ds->ds_object : 0);
 116  117  }
 117  118  
 118  119  zfs_sync_type_t
 119  120  dmu_objset_syncprop(objset_t *os)
 120  121  {
 121  122          return (os->os_sync);
 122  123  }
 123  124  
 124  125  zfs_logbias_op_t
 125  126  dmu_objset_logbias(objset_t *os)
 126  127  {
 127  128          return (os->os_logbias);
 128  129  }
 129  130  
 130  131  static void
 131  132  checksum_changed_cb(void *arg, uint64_t newval)
 132  133  {
 133  134          objset_t *os = arg;
 134  135  
 135  136          /*
 136  137           * Inheritance should have been done by now.
 137  138           */
 138  139          ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 139  140  
 140  141          os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
 141  142  }
 142  143  
 143  144  static void
 144  145  compression_changed_cb(void *arg, uint64_t newval)
 145  146  {
 146  147          objset_t *os = arg;
 147  148  
 148  149          /*
 149  150           * Inheritance and range checking should have been done by now.
 150  151           */
 151  152          ASSERT(newval != ZIO_COMPRESS_INHERIT);
 152  153  
 153  154          os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE);
 154  155  }
 155  156  
 156  157  static void
 157  158  copies_changed_cb(void *arg, uint64_t newval)
 158  159  {
 159  160          objset_t *os = arg;
 160  161  
 161  162          /*
 162  163           * Inheritance and range checking should have been done by now.
 163  164           */
 164  165          ASSERT(newval > 0);
 165  166          ASSERT(newval <= spa_max_replication(os->os_spa));
 166  167  
 167  168          os->os_copies = newval;
 168  169  }
 169  170  
 170  171  static void
 171  172  dedup_changed_cb(void *arg, uint64_t newval)
 172  173  {
 173  174          objset_t *os = arg;
 174  175          spa_t *spa = os->os_spa;
 175  176          enum zio_checksum checksum;
 176  177  
 177  178          /*
 178  179           * Inheritance should have been done by now.
 179  180           */
 180  181          ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 181  182  
 182  183          checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
 183  184  
 184  185          os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
 185  186          os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
 186  187  }
 187  188  
 188  189  static void
 189  190  primary_cache_changed_cb(void *arg, uint64_t newval)
 190  191  {
 191  192          objset_t *os = arg;
 192  193  
 193  194          /*
 194  195           * Inheritance and range checking should have been done by now.
 195  196           */
 196  197          ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 197  198              newval == ZFS_CACHE_METADATA);
 198  199  
 199  200          os->os_primary_cache = newval;
 200  201  }
 201  202  
 202  203  static void
 203  204  secondary_cache_changed_cb(void *arg, uint64_t newval)
 204  205  {
 205  206          objset_t *os = arg;
 206  207  
 207  208          /*
 208  209           * Inheritance and range checking should have been done by now.
 209  210           */
 210  211          ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 211  212              newval == ZFS_CACHE_METADATA);
 212  213  
 213  214          os->os_secondary_cache = newval;
 214  215  }
 215  216  
 216  217  static void
 217  218  sync_changed_cb(void *arg, uint64_t newval)
 218  219  {
 219  220          objset_t *os = arg;
 220  221  
 221  222          /*
 222  223           * Inheritance and range checking should have been done by now.
 223  224           */
 224  225          ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
 225  226              newval == ZFS_SYNC_DISABLED);
 226  227  
 227  228          os->os_sync = newval;
 228  229          if (os->os_zil)
 229  230                  zil_set_sync(os->os_zil, newval);
 230  231  }
 231  232  
 232  233  static void
 233  234  redundant_metadata_changed_cb(void *arg, uint64_t newval)
 234  235  {
 235  236          objset_t *os = arg;
 236  237  
 237  238          /*
 238  239           * Inheritance and range checking should have been done by now.
 239  240           */
 240  241          ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
 241  242              newval == ZFS_REDUNDANT_METADATA_MOST);
 242  243  
 243  244          os->os_redundant_metadata = newval;
 244  245  }
 245  246  
 246  247  static void
 247  248  logbias_changed_cb(void *arg, uint64_t newval)
 248  249  {
 249  250          objset_t *os = arg;
 250  251  
 251  252          ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
 252  253              newval == ZFS_LOGBIAS_THROUGHPUT);
 253  254          os->os_logbias = newval;
 254  255          if (os->os_zil)
 255  256                  zil_set_logbias(os->os_zil, newval);
 256  257  }
 257  258  
 258  259  static void
 259  260  recordsize_changed_cb(void *arg, uint64_t newval)
 260  261  {
 261  262          objset_t *os = arg;
 262  263  
 263  264          os->os_recordsize = newval;
 264  265  }
 265  266  
 266  267  void
 267  268  dmu_objset_byteswap(void *buf, size_t size)
 268  269  {
 269  270          objset_phys_t *osp = buf;
 270  271  
 271  272          ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
 272  273          dnode_byteswap(&osp->os_meta_dnode);
 273  274          byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
 274  275          osp->os_type = BSWAP_64(osp->os_type);
 275  276          osp->os_flags = BSWAP_64(osp->os_flags);
 276  277          if (size == sizeof (objset_phys_t)) {
 277  278                  dnode_byteswap(&osp->os_userused_dnode);
 278  279                  dnode_byteswap(&osp->os_groupused_dnode);
 279  280          }
 280  281  }
 281  282  
 282  283  int
 283  284  dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 284  285      objset_t **osp)
 285  286  {
 286  287          objset_t *os;
 287  288          int i, err;
 288  289  
 289  290          ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 290  291  
 291  292          os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
 292  293          os->os_dsl_dataset = ds;
 293  294          os->os_spa = spa;
 294  295          os->os_rootbp = bp;
 295  296          if (!BP_IS_HOLE(os->os_rootbp)) {
 296  297                  arc_flags_t aflags = ARC_FLAG_WAIT;
 297  298                  zbookmark_phys_t zb;
 298  299                  SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 299  300                      ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 300  301  
 301  302                  if (DMU_OS_IS_L2CACHEABLE(os))
 302  303                          aflags |= ARC_FLAG_L2CACHE;
 303  304                  if (DMU_OS_IS_L2COMPRESSIBLE(os))
 304  305                          aflags |= ARC_FLAG_L2COMPRESS;
 305  306  
 306  307                  dprintf_bp(os->os_rootbp, "reading %s", "");
 307  308                  err = arc_read(NULL, spa, os->os_rootbp,
 308  309                      arc_getbuf_func, &os->os_phys_buf,
 309  310                      ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 310  311                  if (err != 0) {
 311  312                          kmem_free(os, sizeof (objset_t));
 312  313                          /* convert checksum errors into IO errors */
 313  314                          if (err == ECKSUM)
 314  315                                  err = SET_ERROR(EIO);
 315  316                          return (err);
 316  317                  }
 317  318  
 318  319                  /* Increase the blocksize if we are permitted. */
 319  320                  if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
 320  321                      arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
 321  322                          arc_buf_t *buf = arc_buf_alloc(spa,
 322  323                              sizeof (objset_phys_t), &os->os_phys_buf,
 323  324                              ARC_BUFC_METADATA);
 324  325                          bzero(buf->b_data, sizeof (objset_phys_t));
 325  326                          bcopy(os->os_phys_buf->b_data, buf->b_data,
 326  327                              arc_buf_size(os->os_phys_buf));
 327  328                          (void) arc_buf_remove_ref(os->os_phys_buf,
 328  329                              &os->os_phys_buf);
 329  330                          os->os_phys_buf = buf;
 330  331                  }
 331  332  
 332  333                  os->os_phys = os->os_phys_buf->b_data;
 333  334                  os->os_flags = os->os_phys->os_flags;
 334  335          } else {
 335  336                  int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
 336  337                      sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
 337  338                  os->os_phys_buf = arc_buf_alloc(spa, size,
 338  339                      &os->os_phys_buf, ARC_BUFC_METADATA);
 339  340                  os->os_phys = os->os_phys_buf->b_data;
 340  341                  bzero(os->os_phys, size);
 341  342          }
 342  343  
 343  344          /*
 344  345           * Note: the changed_cb will be called once before the register
 345  346           * func returns, thus changing the checksum/compression from the
 346  347           * default (fletcher2/off).  Snapshots don't need to know about
 347  348           * checksum/compression/copies.
  
    | 
      ↓ open down ↓ | 
    312 lines elided | 
    
      ↑ open up ↑ | 
  
 348  349           */
 349  350          if (ds != NULL) {
 350  351                  err = dsl_prop_register(ds,
 351  352                      zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 352  353                      primary_cache_changed_cb, os);
 353  354                  if (err == 0) {
 354  355                          err = dsl_prop_register(ds,
 355  356                              zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 356  357                              secondary_cache_changed_cb, os);
 357  358                  }
 358      -                if (!dsl_dataset_is_snapshot(ds)) {
      359 +                if (!ds->ds_is_snapshot) {
 359  360                          if (err == 0) {
 360  361                                  err = dsl_prop_register(ds,
 361  362                                      zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 362  363                                      checksum_changed_cb, os);
 363  364                          }
 364  365                          if (err == 0) {
 365  366                                  err = dsl_prop_register(ds,
 366  367                                      zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 367  368                                      compression_changed_cb, os);
 368  369                          }
 369  370                          if (err == 0) {
 370  371                                  err = dsl_prop_register(ds,
 371  372                                      zfs_prop_to_name(ZFS_PROP_COPIES),
 372  373                                      copies_changed_cb, os);
 373  374                          }
 374  375                          if (err == 0) {
 375  376                                  err = dsl_prop_register(ds,
 376  377                                      zfs_prop_to_name(ZFS_PROP_DEDUP),
 377  378                                      dedup_changed_cb, os);
 378  379                          }
 379  380                          if (err == 0) {
 380  381                                  err = dsl_prop_register(ds,
 381  382                                      zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 382  383                                      logbias_changed_cb, os);
 383  384                          }
 384  385                          if (err == 0) {
 385  386                                  err = dsl_prop_register(ds,
 386  387                                      zfs_prop_to_name(ZFS_PROP_SYNC),
 387  388                                      sync_changed_cb, os);
 388  389                          }
 389  390                          if (err == 0) {
 390  391                                  err = dsl_prop_register(ds,
 391  392                                      zfs_prop_to_name(
 392  393                                      ZFS_PROP_REDUNDANT_METADATA),
 393  394                                      redundant_metadata_changed_cb, os);
 394  395                          }
 395  396                          if (err == 0) {
 396  397                                  err = dsl_prop_register(ds,
 397  398                                      zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 398  399                                      recordsize_changed_cb, os);
 399  400                          }
 400  401                  }
 401  402                  if (err != 0) {
 402  403                          VERIFY(arc_buf_remove_ref(os->os_phys_buf,
 403  404                              &os->os_phys_buf));
 404  405                          kmem_free(os, sizeof (objset_t));
 405  406                          return (err);
 406  407                  }
 407  408          } else {
 408  409                  /* It's the meta-objset. */
 409  410                  os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
  
    | 
      ↓ open down ↓ | 
    41 lines elided | 
    
      ↑ open up ↑ | 
  
 410  411                  os->os_compress = ZIO_COMPRESS_LZJB;
 411  412                  os->os_copies = spa_max_replication(spa);
 412  413                  os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
 413  414                  os->os_dedup_verify = B_FALSE;
 414  415                  os->os_logbias = ZFS_LOGBIAS_LATENCY;
 415  416                  os->os_sync = ZFS_SYNC_STANDARD;
 416  417                  os->os_primary_cache = ZFS_CACHE_ALL;
 417  418                  os->os_secondary_cache = ZFS_CACHE_ALL;
 418  419          }
 419  420  
 420      -        if (ds == NULL || !dsl_dataset_is_snapshot(ds))
      421 +        if (ds == NULL || !ds->ds_is_snapshot)
 421  422                  os->os_zil_header = os->os_phys->os_zil_header;
 422  423          os->os_zil = zil_alloc(os, &os->os_zil_header);
 423  424  
 424  425          for (i = 0; i < TXG_SIZE; i++) {
 425  426                  list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
 426  427                      offsetof(dnode_t, dn_dirty_link[i]));
 427  428                  list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
 428  429                      offsetof(dnode_t, dn_dirty_link[i]));
 429  430          }
 430  431          list_create(&os->os_dnodes, sizeof (dnode_t),
 431  432              offsetof(dnode_t, dn_link));
 432  433          list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
 433  434              offsetof(dmu_buf_impl_t, db_link));
 434  435  
 435  436          mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
 436  437          mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 437  438          mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 438  439  
 439      -        DMU_META_DNODE(os) = dnode_special_open(os,
 440      -            &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
 441      -            &os->os_meta_dnode);
      440 +        dnode_special_open(os, &os->os_phys->os_meta_dnode,
      441 +            DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
 442  442          if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
 443      -                DMU_USERUSED_DNODE(os) = dnode_special_open(os,
 444      -                    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
 445      -                    &os->os_userused_dnode);
 446      -                DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
 447      -                    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
 448      -                    &os->os_groupused_dnode);
      443 +                dnode_special_open(os, &os->os_phys->os_userused_dnode,
      444 +                    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
      445 +                dnode_special_open(os, &os->os_phys->os_groupused_dnode,
      446 +                    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
 449  447          }
 450  448  
 451  449          *osp = os;
 452  450          return (0);
 453  451  }
 454  452  
 455  453  int
 456  454  dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 457  455  {
 458  456          int err = 0;
 459  457  
 460  458          mutex_enter(&ds->ds_opening_lock);
 461  459          if (ds->ds_objset == NULL) {
 462  460                  objset_t *os;
 463  461                  err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
 464  462                      ds, dsl_dataset_get_blkptr(ds), &os);
 465  463  
 466  464                  if (err == 0) {
 467  465                          mutex_enter(&ds->ds_lock);
 468  466                          ASSERT(ds->ds_objset == NULL);
 469  467                          ds->ds_objset = os;
 470  468                          mutex_exit(&ds->ds_lock);
 471  469                  }
 472  470          }
 473  471          *osp = ds->ds_objset;
 474  472          mutex_exit(&ds->ds_opening_lock);
 475  473          return (err);
 476  474  }
 477  475  
 478  476  /*
 479  477   * Holds the pool while the objset is held.  Therefore only one objset
 480  478   * can be held at a time.
 481  479   */
 482  480  int
 483  481  dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 484  482  {
 485  483          dsl_pool_t *dp;
 486  484          dsl_dataset_t *ds;
 487  485          int err;
 488  486  
 489  487          err = dsl_pool_hold(name, tag, &dp);
 490  488          if (err != 0)
 491  489                  return (err);
 492  490          err = dsl_dataset_hold(dp, name, tag, &ds);
 493  491          if (err != 0) {
 494  492                  dsl_pool_rele(dp, tag);
 495  493                  return (err);
 496  494          }
 497  495  
 498  496          err = dmu_objset_from_ds(ds, osp);
 499  497          if (err != 0) {
 500  498                  dsl_dataset_rele(ds, tag);
 501  499                  dsl_pool_rele(dp, tag);
 502  500          }
 503  501  
 504  502          return (err);
 505  503  }
 506  504  
 507  505  /*
 508  506   * dsl_pool must not be held when this is called.
 509  507   * Upon successful return, there will be a longhold on the dataset,
 510  508   * and the dsl_pool will not be held.
 511  509   */
 512  510  int
 513  511  dmu_objset_own(const char *name, dmu_objset_type_t type,
 514  512      boolean_t readonly, void *tag, objset_t **osp)
 515  513  {
 516  514          dsl_pool_t *dp;
 517  515          dsl_dataset_t *ds;
 518  516          int err;
 519  517  
 520  518          err = dsl_pool_hold(name, FTAG, &dp);
 521  519          if (err != 0)
 522  520                  return (err);
 523  521          err = dsl_dataset_own(dp, name, tag, &ds);
 524  522          if (err != 0) {
 525  523                  dsl_pool_rele(dp, FTAG);
  
    | 
      ↓ open down ↓ | 
    67 lines elided | 
    
      ↑ open up ↑ | 
  
 526  524                  return (err);
 527  525          }
 528  526  
 529  527          err = dmu_objset_from_ds(ds, osp);
 530  528          dsl_pool_rele(dp, FTAG);
 531  529          if (err != 0) {
 532  530                  dsl_dataset_disown(ds, tag);
 533  531          } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
 534  532                  dsl_dataset_disown(ds, tag);
 535  533                  return (SET_ERROR(EINVAL));
 536      -        } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
      534 +        } else if (!readonly && ds->ds_is_snapshot) {
 537  535                  dsl_dataset_disown(ds, tag);
 538  536                  return (SET_ERROR(EROFS));
 539  537          }
 540  538          return (err);
 541  539  }
 542  540  
 543  541  void
 544  542  dmu_objset_rele(objset_t *os, void *tag)
 545  543  {
 546  544          dsl_pool_t *dp = dmu_objset_pool(os);
 547  545          dsl_dataset_rele(os->os_dsl_dataset, tag);
 548  546          dsl_pool_rele(dp, tag);
 549  547  }
 550  548  
 551  549  /*
 552  550   * When we are called, os MUST refer to an objset associated with a dataset
 553  551   * that is owned by 'tag'; that is, it is held and long held by 'tag' and
 554  552   * ds_owner == tag.  We will then release and reacquire ownership of the
 555  553   * dataset while holding the pool config_rwlock so that no intervening
 556  554   * namespace or ownership changes can occur.
 557  555   *
 558  556   * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
 559  557   * release the hold on its dataset and acquire a new one on the dataset of the
 560  558   * same name so that it can be partially torn down and reconstructed.
 561  559   */
 562  560  void
 563  561  dmu_objset_refresh_ownership(objset_t *os, void *tag)
 564  562  {
 565  563          dsl_pool_t *dp;
 566  564          dsl_dataset_t *ds, *newds;
 567  565          char name[MAXNAMELEN];
 568  566  
 569  567          ds = os->os_dsl_dataset;
 570  568          VERIFY3P(ds, !=, NULL);
 571  569          VERIFY3P(ds->ds_owner, ==, tag);
 572  570          VERIFY(dsl_dataset_long_held(ds));
 573  571  
 574  572          dsl_dataset_name(ds, name);
 575  573          dp = dmu_objset_pool(os);
 576  574          dsl_pool_config_enter(dp, FTAG);
 577  575          dmu_objset_disown(os, tag);
 578  576          VERIFY0(dsl_dataset_own(dp, name, tag, &newds));
 579  577          VERIFY3P(newds, ==, os->os_dsl_dataset);
 580  578          dsl_pool_config_exit(dp, FTAG);
 581  579  }
  
    | 
      ↓ open down ↓ | 
    35 lines elided | 
    
      ↑ open up ↑ | 
  
 582  580  
 583  581  void
 584  582  dmu_objset_disown(objset_t *os, void *tag)
 585  583  {
 586  584          dsl_dataset_disown(os->os_dsl_dataset, tag);
 587  585  }
 588  586  
 589  587  void
 590  588  dmu_objset_evict_dbufs(objset_t *os)
 591  589  {
      590 +        dnode_t dn_marker;
 592  591          dnode_t *dn;
 593  592  
 594  593          mutex_enter(&os->os_lock);
      594 +        dn = list_head(&os->os_dnodes);
      595 +        while (dn != NULL) {
      596 +                /*
      597 +                 * Skip dnodes without holds.  We have to do this dance
      598 +                 * because dnode_add_ref() only works if there is already a
      599 +                 * hold.  If the dnode has no holds, then it has no dbufs.
      600 +                 */
      601 +                if (dnode_add_ref(dn, FTAG)) {
      602 +                        list_insert_after(&os->os_dnodes, dn, &dn_marker);
      603 +                        mutex_exit(&os->os_lock);
 595  604  
 596      -        /* process the mdn last, since the other dnodes have holds on it */
 597      -        list_remove(&os->os_dnodes, DMU_META_DNODE(os));
 598      -        list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
      605 +                        dnode_evict_dbufs(dn);
      606 +                        dnode_rele(dn, FTAG);
 599  607  
 600      -        /*
 601      -         * Find the first dnode with holds.  We have to do this dance
 602      -         * because dnode_add_ref() only works if you already have a
 603      -         * hold.  If there are no holds then it has no dbufs so OK to
 604      -         * skip.
 605      -         */
 606      -        for (dn = list_head(&os->os_dnodes);
 607      -            dn && !dnode_add_ref(dn, FTAG);
 608      -            dn = list_next(&os->os_dnodes, dn))
 609      -                continue;
 610      -
 611      -        while (dn) {
 612      -                dnode_t *next_dn = dn;
 613      -
 614      -                do {
 615      -                        next_dn = list_next(&os->os_dnodes, next_dn);
 616      -                } while (next_dn && !dnode_add_ref(next_dn, FTAG));
 617      -
 618      -                mutex_exit(&os->os_lock);
 619      -                dnode_evict_dbufs(dn);
 620      -                dnode_rele(dn, FTAG);
 621      -                mutex_enter(&os->os_lock);
 622      -                dn = next_dn;
      608 +                        mutex_enter(&os->os_lock);
      609 +                        dn = list_next(&os->os_dnodes, &dn_marker);
      610 +                        list_remove(&os->os_dnodes, &dn_marker);
      611 +                } else {
      612 +                        dn = list_next(&os->os_dnodes, dn);
      613 +                }
 623  614          }
 624  615          mutex_exit(&os->os_lock);
      616 +
      617 +        if (DMU_USERUSED_DNODE(os) != NULL) {
      618 +                dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
      619 +                dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
      620 +        }
      621 +        dnode_evict_dbufs(DMU_META_DNODE(os));
 625  622  }
 626  623  
      624 +/*
      625 + * Objset eviction processing is split into two pieces.
      626 + * The first marks the objset as evicting, evicts any dbufs that
      627 + * have a refcount of zero, and then queues up the objset for the
      628 + * second phase of eviction.  Once os->os_dnodes has been cleared by
      629 + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
      630 + * The second phase closes the special dnodes, dequeues the objset from
      631 + * the list of those undergoing eviction, and finally frees the objset.
      632 + *
      633 + * NOTE: Due to asynchronous eviction processing (invocation of
      634 + *       dnode_buf_pageout()), it is possible for the meta dnode for the
      635 + *       objset to have no holds even though os->os_dnodes is not empty.
      636 + */
 627  637  void
 628  638  dmu_objset_evict(objset_t *os)
 629  639  {
                   /*
                    * First phase of objset teardown: unregister the property
                    * callbacks, tear down SA state, evict the dbufs and register
                    * the objset as evicting.  The teardown is completed here only
                    * if no dnodes remain on os->os_dnodes.
                    */
 630  640          dsl_dataset_t *ds = os->os_dsl_dataset;
 631  641  
                   /* The objset must be clean in every txg slot. */
 632  642          for (int t = 0; t < TXG_SIZE; t++)
 633  643                  ASSERT(!dmu_objset_is_dirty(os, t));
 634  644  
                   /*
                    * Drop the dsl_prop callbacks.  Everything except the
                    * primarycache/secondarycache callbacks is skipped when the
                    * dataset is a snapshot.
                    */
 635  645          if (ds) {
 636      -                if (!dsl_dataset_is_snapshot(ds)) {
      646 +                if (!ds->ds_is_snapshot) {
 637  647                          VERIFY0(dsl_prop_unregister(ds,
 638  648                              zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 639  649                              checksum_changed_cb, os));
 640  650                          VERIFY0(dsl_prop_unregister(ds,
 641  651                              zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 642  652                              compression_changed_cb, os));
 643  653                          VERIFY0(dsl_prop_unregister(ds,
 644  654                              zfs_prop_to_name(ZFS_PROP_COPIES),
 645  655                              copies_changed_cb, os));
 646  656                          VERIFY0(dsl_prop_unregister(ds,
 647  657                              zfs_prop_to_name(ZFS_PROP_DEDUP),
 648  658                              dedup_changed_cb, os));
 649  659                          VERIFY0(dsl_prop_unregister(ds,
 650  660                              zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 651  661                              logbias_changed_cb, os));
 652  662                          VERIFY0(dsl_prop_unregister(ds,
 653  663                              zfs_prop_to_name(ZFS_PROP_SYNC),
 654  664                              sync_changed_cb, os));
 655  665                          VERIFY0(dsl_prop_unregister(ds,
 656  666                              zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
 657  667                              redundant_metadata_changed_cb, os));
 658  668                          VERIFY0(dsl_prop_unregister(ds,
 659  669                              zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 660  670                              recordsize_changed_cb, os));
 661  671                  }
 662  672                  VERIFY0(dsl_prop_unregister(ds,
  
    | 
      ↓ open down ↓ | 
    16 lines elided | 
    
      ↑ open up ↑ | 
  
 663  673                      zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 664  674                      primary_cache_changed_cb, os));
 665  675                  VERIFY0(dsl_prop_unregister(ds,
 666  676                      zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 667  677                      secondary_cache_changed_cb, os));
 668  678          }
 669  679  
 670  680          if (os->os_sa)
 671  681                  sa_tear_down(os);
 672  682  
                   /* Flag the objset as evicting before its dbufs are evicted. */
      683 +        os->os_evicting = B_TRUE;
 673  684          dmu_objset_evict_dbufs(os);
 674  685  
                   /*
                    * Register with the spa as an evicting objset, then finish
                    * the teardown right away if the dnode list is already empty.
                    * os_lock is dropped before calling dmu_objset_evict_done()
                    * because that function destroys the lock and frees os.
                    * When dnodes remain, this function just returns and the
                    * second phase is left to run later (see the block comment
                    * preceding this function).
                    */
      686 +        mutex_enter(&os->os_lock);
      687 +        spa_evicting_os_register(os->os_spa, os);
      688 +        if (list_is_empty(&os->os_dnodes)) {
      689 +                mutex_exit(&os->os_lock);
      690 +                dmu_objset_evict_done(os);
      691 +        } else {
      692 +                mutex_exit(&os->os_lock);
      693 +        }
      694 +}
      695 +
      696 +void
      697 +dmu_objset_evict_done(objset_t *os)
      698 +{
                   /*
                    * Second (final) phase of objset teardown.  Requires that no
                    * dnodes remain on os->os_dnodes.  Closes the special dnodes,
                    * frees the ZIL, drops the reference on the objset's phys
                    * buffer, destroys the locks, deregisters the objset from the
                    * spa's evicting set and frees the objset_t itself.  os must
                    * not be used by the caller after this returns.
                    */
      699 +        ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
      700 +
 675  701          dnode_special_close(&os->os_meta_dnode);
                   /* The user/group accounting dnodes exist only on some objsets. */
 676  702          if (DMU_USERUSED_DNODE(os)) {
 677  703                  dnode_special_close(&os->os_userused_dnode);
 678  704                  dnode_special_close(&os->os_groupused_dnode);
 679  705          }
 680  706          zil_free(os->os_zil);
 681  707  
 682      -        ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 683      -
 684  708          VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 685  709  
 686  710          /*
 687  711           * This is a barrier to prevent the objset from going away in
 688  712           * dnode_move() until we can safely ensure that the objset is still in
 689  713           * use. We consider the objset valid before the barrier and invalid
 690  714           * after the barrier.
 691  715           */
 692  716          rw_enter(&os_lock, RW_READER);
 693  717          rw_exit(&os_lock);
 694  718  
 695  719          mutex_destroy(&os->os_lock);
 696  720          mutex_destroy(&os->os_obj_lock);
 697  721          mutex_destroy(&os->os_user_ptr_lock);
                   /* Deregister last, immediately before the memory is released. */
      722 +        spa_evicting_os_deregister(os->os_spa, os);
 698  723          kmem_free(os, sizeof (objset_t));
 699  724  }
 700  725  
 701  726  timestruc_t
 702  727  dmu_objset_snap_cmtime(objset_t *os)
 703  728  {
                   /*
                    * Return dsl_dir_snap_cmtime() for the dsl_dir of this
                    * objset's dataset.  os->os_dsl_dataset is dereferenced
                    * without a NULL check.
                    * NOTE(review): presumably never called on an objset that
                    * has no dataset -- confirm against callers.
                    */
 704  729          return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
 705  730  }
 706  731  
 707  732  /* called from dsl for meta-objset */
 708  733  objset_t *
 709  734  dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
 710  735      dmu_objset_type_t type, dmu_tx_t *tx)
 711  736  {
                   /*
                    * Initialize a new objset of the given type in syncing
                    * context and return it.  ds may be NULL, in which case the
                    * objset is opened directly from bp.
                    */
 712  737          objset_t *os;
 713  738          dnode_t *mdn;
 714  739  
 715  740          ASSERT(dmu_tx_is_syncing(tx));
 716  741  
                   /* Either way of obtaining the objset is required to succeed. */
 717  742          if (ds != NULL)
 718  743                  VERIFY0(dmu_objset_from_ds(ds, &os));
 719  744          else
 720  745                  VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
 721  746  
 722  747          mdn = DMU_META_DNODE(os);
 723  748  
 724  749          dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
 725  750              DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
 726  751  
 727  752          /*
 728  753           * We don't want to have to increase the meta-dnode's nlevels
 729  754           * later, because then we could do it in quiescing context while
 730  755           * we are also accessing it in open context.
 731  756           *
 732  757           * This precaution is not necessary for the MOS (ds == NULL),
 733  758           * because the MOS is only updated in syncing context.
 734  759           * This is most fortunate: the MOS is the only objset that
 735  760           * needs to be synced multiple times as spa_sync() iterates
 736  761           * to convergence, so minimizing its dn_nlevels matters.
 737  762           */
 738  763          if (ds != NULL) {
 739  764                  int levels = 1;
 740  765  
 741  766                  /*
 742  767                   * Determine the number of levels necessary for the meta-dnode
 743  768                   * to contain DN_MAX_OBJECT dnodes.
 744  769                   */
 745  770                  while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift +
 746  771                      (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
 747  772                      DN_MAX_OBJECT * sizeof (dnode_phys_t))
 748  773                          levels++;
 749  774  
                           /* Apply the level count now and for this txg's sync. */
 750  775                  mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
 751  776                      mdn->dn_nlevels = levels;
 752  777          }
 753  778  
                   /* Record the (asserted-valid) objset type in the on-disk header. */
 754  779          ASSERT(type != DMU_OST_NONE);
 755  780          ASSERT(type != DMU_OST_ANY);
 756  781          ASSERT(type < DMU_OST_NUMTYPES);
 757  782          os->os_phys->os_type = type;
                   /*
                    * When user accounting is enabled, mark it complete from the
                    * start and mirror the on-disk flags into os_flags.
                    */
 758  783          if (dmu_objset_userused_enabled(os)) {
 759  784                  os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 760  785                  os->os_flags = os->os_phys->os_flags;
 761  786          }
 762  787  
                   /*
                    * ds can be NULL on this path (the ds == NULL case above).
                    * NOTE(review): presumably dsl_dataset_dirty() tolerates a
                    * NULL dataset -- confirm.
                    */
 763  788          dsl_dataset_dirty(ds, tx);
 764  789  
 765  790          return (os);
 766  791  }
 767  792  
 768  793  typedef struct dmu_objset_create_arg {
                   /* Name passed to dmu_objset_create(); must not contain '@'. */
 769  794          const char *doca_name;
                   /* Credentials of the requester, taken from CRED(). */
 770  795          cred_t *doca_cred;
                   /* Optional callback run on the new objset in the sync function. */
 771  796          void (*doca_userfunc)(objset_t *os, void *arg,
 772  797              cred_t *cr, dmu_tx_t *tx);
                   /* Opaque argument handed to doca_userfunc. */
 773  798          void *doca_userarg;
                   /* Objset type given to dmu_objset_create_impl(). */
 774  799          dmu_objset_type_t doca_type;
                   /* Flags given to dsl_dataset_create_sync(). */
 775  800          uint64_t doca_flags;
 776  801  } dmu_objset_create_arg_t;
 777  802  
 778  803  /*ARGSUSED*/
 779  804  static int
 780  805  dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 781  806  {
                   /*
                    * Check function for the objset-create sync task.  Returns 0
                    * if creation may proceed, otherwise an errno value.  arg is
                    * a dmu_objset_create_arg_t.
                    */
 782  807          dmu_objset_create_arg_t *doca = arg;
 783  808          dsl_pool_t *dp = dmu_tx_pool(tx);
 784  809          dsl_dir_t *pdd;
 785  810          const char *tail;
 786  811          int error;
 787  812  
                   /* Names containing '@' are rejected. */
 788  813          if (strchr(doca->doca_name, '@') != NULL)
 789  814                  return (SET_ERROR(EINVAL));
 790  815  
 791  816          error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
 792  817          if (error != 0)
 793  818                  return (error);
                   /*
                    * NOTE(review): a NULL tail presumably means the whole name
                    * resolved to an existing dsl_dir -- confirm against
                    * dsl_dir_hold().
                    */
 794  819          if (tail == NULL) {
 795  820                  dsl_dir_rele(pdd, FTAG);
 796  821                  return (SET_ERROR(EEXIST));
 797  822          }
                   /* The limit check's result is returned as-is, after the rele. */
 798  823          error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 799  824              doca->doca_cred);
 800  825          dsl_dir_rele(pdd, FTAG);
 801  826  
 802  827          return (error);
 803  828  }
 804  829  
 805  830  static void
 806  831  dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 807  832  {
                   /*
                    * Sync function for the objset-create sync task: create the
                    * dataset, initialize its objset, run the optional user
                    * callback and log the creation.  arg is a
                    * dmu_objset_create_arg_t.  Every step is required to succeed.
                    */
 808  833          dmu_objset_create_arg_t *doca = arg;
 809  834          dsl_pool_t *dp = dmu_tx_pool(tx);
 810  835          dsl_dir_t *pdd;
 811  836          const char *tail;
 812  837          dsl_dataset_t *ds;
 813  838          uint64_t obj;
 814  839          blkptr_t *bp;
 815  840          objset_t *os;
 816  841  
 817  842          VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 818  843  
                   /* No origin is passed (NULL): this is a plain create, not a clone. */
 819  844          obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
 820  845              doca->doca_cred, tx);
 821  846  
 822  847          VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 823  848          bp = dsl_dataset_get_blkptr(ds);
 824  849          os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
 825  850              ds, bp, doca->doca_type, tx);
 826  851  
 827  852          if (doca->doca_userfunc != NULL) {
 828  853                  doca->doca_userfunc(os, doca->doca_userarg,
 829  854                      doca->doca_cred, tx);
 830  855          }
 831  856  
 832  857          spa_history_log_internal_ds(ds, "create", tx, "");
 833  858          dsl_dataset_rele(ds, FTAG);
 834  859          dsl_dir_rele(pdd, FTAG);
 835  860  }
 836  861  
 837  862  int
 838  863  dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
 839  864      void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 840  865  {
                   /*
                    * Create a new objset called name of the given type by
                    * running dmu_objset_create_check()/dmu_objset_create_sync()
                    * as a sync task.  func, if non-NULL, is called with arg on
                    * the new objset from the sync function.  Returns the result
                    * of dsl_sync_task().
                    */
 841  866          dmu_objset_create_arg_t doca;
 842  867  
 843  868          doca.doca_name = name;
 844  869          doca.doca_cred = CRED();
 845  870          doca.doca_flags = flags;
 846  871          doca.doca_userfunc = func;
 847  872          doca.doca_userarg = arg;
 848  873          doca.doca_type = type;
 849  874  
                   /*
                    * NOTE(review): the literal 5 is presumably the
                    * blocks-modified estimate taken by dsl_sync_task() --
                    * confirm against its prototype.
                    */
 850  875          return (dsl_sync_task(name,
 851  876              dmu_objset_create_check, dmu_objset_create_sync, &doca,
 852  877              5, ZFS_SPACE_CHECK_NORMAL));
 853  878  }
 854  879  
 855  880  typedef struct dmu_objset_clone_arg {
                   /* Name of the clone to create; must not contain '@'. */
 856  881          const char *doca_clone;
                   /* Name of the origin dataset; it must be a snapshot. */
 857  882          const char *doca_origin;
                   /* Credentials of the requester, taken from CRED(). */
 858  883          cred_t *doca_cred;
 859  884  } dmu_objset_clone_arg_t;
 860  885  
 861  886  /*ARGSUSED*/
 862  887  static int
 863  888  dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
 864  889  {
                   /*
                    * Check function for the clone sync task.  Returns 0 if the
                    * clone may be created, otherwise an errno value.  arg is a
                    * dmu_objset_clone_arg_t.  All holds taken here are released
                    * before returning.
                    */
 865  890          dmu_objset_clone_arg_t *doca = arg;
 866  891          dsl_dir_t *pdd;
 867  892          const char *tail;
 868  893          int error;
 869  894          dsl_dataset_t *origin;
 870  895          dsl_pool_t *dp = dmu_tx_pool(tx);
 871  896  
 872  897          if (strchr(doca->doca_clone, '@') != NULL)
 873  898                  return (SET_ERROR(EINVAL));
 874  899  
 875  900          error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
 876  901          if (error != 0)
 877  902                  return (error);
 878  903          if (tail == NULL) {
 879  904                  dsl_dir_rele(pdd, FTAG);
 880  905                  return (SET_ERROR(EEXIST));
 881  906          }
 882  907          /* You can't clone across pools. */
 883  908          if (pdd->dd_pool != dp) {
 884  909                  dsl_dir_rele(pdd, FTAG);
 885  910                  return (SET_ERROR(EXDEV));
 886  911          }
                   /*
                    * Any failure of the limit check is reported as EDQUOT,
                    * regardless of the error value the check itself returned.
                    */
 887  912          error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 888  913              doca->doca_cred);
 889  914          if (error != 0) {
 890  915                  dsl_dir_rele(pdd, FTAG);
 891  916                  return (SET_ERROR(EDQUOT));
 892  917          }
 893  918          dsl_dir_rele(pdd, FTAG);
 894  919  
 895  920          error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
  
    | 
      ↓ open down ↓ | 
    188 lines elided | 
    
      ↑ open up ↑ | 
  
 896  921          if (error != 0)
 897  922                  return (error);
 898  923  
 899  924          /* You can't clone across pools. */
 900  925          if (origin->ds_dir->dd_pool != dp) {
 901  926                  dsl_dataset_rele(origin, FTAG);
 902  927                  return (SET_ERROR(EXDEV));
 903  928          }
 904  929  
 905  930          /* You can only clone snapshots, not the head datasets. */
 906      -        if (!dsl_dataset_is_snapshot(origin)) {
      931 +        if (!origin->ds_is_snapshot) {
 907  932                  dsl_dataset_rele(origin, FTAG);
 908  933                  return (SET_ERROR(EINVAL));
 909  934          }
 910  935          dsl_dataset_rele(origin, FTAG);
 911  936  
 912  937          return (0);
 913  938  }
 914  939  
 915  940  static void
 916  941  dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
 917  942  {
                   /*
                    * Sync function for the clone sync task: create the clone
                    * dataset from the origin and record the operation in the
                    * pool history.  arg is a dmu_objset_clone_arg_t.  The holds
                    * are required to succeed.
                    */
 918  943          dmu_objset_clone_arg_t *doca = arg;
 919  944          dsl_pool_t *dp = dmu_tx_pool(tx);
 920  945          dsl_dir_t *pdd;
 921  946          const char *tail;
 922  947          dsl_dataset_t *origin, *ds;
 923  948          uint64_t obj;
 924  949          char namebuf[MAXNAMELEN];
 925  950  
 926  951          VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
 927  952          VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
 928  953  
 929  954          obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
 930  955              doca->doca_cred, tx);
 931  956  
                   /* Hold the new dataset only long enough to write the log entry. */
 932  957          VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 933  958          dsl_dataset_name(origin, namebuf);
 934  959          spa_history_log_internal_ds(ds, "clone", tx,
 935  960              "origin=%s (%llu)", namebuf, origin->ds_object);
 936  961          dsl_dataset_rele(ds, FTAG);
 937  962          dsl_dataset_rele(origin, FTAG);
 938  963          dsl_dir_rele(pdd, FTAG);
 939  964  }
 940  965  
 941  966  int
 942  967  dmu_objset_clone(const char *clone, const char *origin)
 943  968  {
                   /*
                    * Create clone from the snapshot named origin by running
                    * dmu_objset_clone_check()/dmu_objset_clone_sync() as a sync
                    * task.  Returns the result of dsl_sync_task().
                    */
 944  969          dmu_objset_clone_arg_t doca;
 945  970  
 946  971          doca.doca_clone = clone;
 947  972          doca.doca_origin = origin;
 948  973          doca.doca_cred = CRED();
 949  974  
 950  975          return (dsl_sync_task(clone,
 951  976              dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
 952  977              5, ZFS_SPACE_CHECK_NORMAL));
 953  978  }
 954  979  
 955  980  int
 956  981  dmu_objset_snapshot_one(const char *fsname, const char *snapname)
 957  982  {
                   /*
                    * Take the single snapshot "fsname@snapname" by handing a
                    * one-entry nvlist to dsl_dataset_snapshot().  Returns that
                    * call's result.
                    */
 958  983          int err;
 959  984          char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
 960  985          nvlist_t *snaps = fnvlist_alloc();
 961  986  
                   /*
                    * longsnap is freed right after being added.
                    * NOTE(review): this relies on fnvlist_add_boolean() copying
                    * the name -- confirm.
                    */
 962  987          fnvlist_add_boolean(snaps, longsnap);
 963  988          strfree(longsnap);
 964  989          err = dsl_dataset_snapshot(snaps, NULL, NULL);
 965  990          fnvlist_free(snaps);
 966  991          return (err);
 967  992  }
 968  993  
 969  994  static void
 970  995  dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx)
 971  996  {
                   /*
                    * Drain list, calling dnode_sync() on every dnode removed from
                    * it.  If newlist is non-NULL, each dnode is first given an
                    * additional reference (tagged with newlist) and appended to
                    * newlist.
                    */
 972  997          dnode_t *dn;
 973  998  
 974  999          while (dn = list_head(list)) {
 975 1000                  ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 976 1001                  ASSERT(dn->dn_dbuf->db_data_pending);
 977 1002                  /*
 978 1003                   * Initialize dn_zio outside dnode_sync() because the
 979 1004                   * meta-dnode needs to set it outside dnode_sync().
 980 1005                   */
 981 1006                  dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
 982 1007                  ASSERT(dn->dn_zio);
 983 1008  
 984 1009                  ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
 985 1010                  list_remove(list, dn);
 986 1011  
 987 1012                  if (newlist) {
 988 1013                          (void) dnode_add_ref(dn, newlist);
 989 1014                          list_insert_tail(newlist, dn);
 990 1015                  }
 991 1016  
 992 1017                  dnode_sync(dn, tx);
 993 1018          }
 994 1019  }
 995 1020  
 996 1021  /* ARGSUSED */
 997 1022  static void
 998 1023  dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
 999 1024  {
                   /*
                    * Callback passed to arc_write() by dmu_objset_sync() for the
                    * objset's root block; arg is the objset_t.  It recomputes
                    * the fill count of the root block pointer.
                    */
1000 1025          blkptr_t *bp = zio->io_bp;
1001 1026          objset_t *os = arg;
1002 1027          dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
1003 1028  
1004 1029          ASSERT(!BP_IS_EMBEDDED(bp));
1005 1030          ASSERT3P(bp, ==, os->os_rootbp);
1006 1031          ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
1007 1032          ASSERT0(BP_GET_LEVEL(bp));
1008 1033  
1009 1034          /*
1010 1035           * Update rootbp fill count: it should be the number of objects
1011 1036           * allocated in the object set (not counting the "special"
1012 1037           * objects that are stored in the objset_phys_t -- the meta
1013 1038           * dnode and user/group accounting objects).
1014 1039           */
                   /* Sum the fill counts of the meta-dnode's block pointers. */
1015 1040          bp->blk_fill = 0;
1016 1041          for (int i = 0; i < dnp->dn_nblkptr; i++)
1017 1042                  bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
1018 1043  }
1019 1044  
1020 1045  /* ARGSUSED */
1021 1046  static void
1022 1047  dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
1023 1048  {
                   /*
                    * Callback passed to arc_write() by dmu_objset_sync() for the
                    * objset's root block; arg is the objset_t.  Unless the write
                    * was a rewrite, it accounts for the replaced and the new
                    * block against the dataset.
                    */
1024 1049          blkptr_t *bp = zio->io_bp;
1025 1050          blkptr_t *bp_orig = &zio->io_bp_orig;
1026 1051          objset_t *os = arg;
1027 1052  
1028 1053          if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
                           /* A rewrite must leave the block pointer unchanged. */
1029 1054                  ASSERT(BP_EQUAL(bp, bp_orig));
1030 1055          } else {
1031 1056                  dsl_dataset_t *ds = os->os_dsl_dataset;
                           /* The tx is the one stashed by dmu_objset_sync(). */
1032 1057                  dmu_tx_t *tx = os->os_synctx;
1033 1058  
1034 1059                  (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
1035 1060                  dsl_dataset_block_born(ds, bp, tx);
1036 1061          }
1037 1062  }
1038 1063  
1039 1064  /* called from dsl */
1040 1065  void
1041 1066  dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
1042 1067  {
                   /*
                    * Sync the objset for tx in syncing context: create the root
                    * block write as a child of pio, sync the special dnodes and
                    * then the free and dirty dnodes of this txg, issue the
                    * meta-dnode's pending writes, sync the ZIL and finally issue
                    * the root block write.
                    */
1043 1068          int txgoff;
1044 1069          zbookmark_phys_t zb;
1045 1070          zio_prop_t zp;
1046 1071          zio_t *zio;
1047 1072          list_t *list;
1048 1073          list_t *newlist = NULL;
1049 1074          dbuf_dirty_record_t *dr;
1050 1075  
1051 1076          dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
1052 1077  
1053 1078          ASSERT(dmu_tx_is_syncing(tx));
1054 1079          /* XXX the write_done callback should really give us the tx... */
1055 1080          os->os_synctx = tx;
1056 1081  
1057 1082          if (os->os_dsl_dataset == NULL) {
1058 1083                  /*
1059 1084                   * This is the MOS.  If we have upgraded,
1060 1085                   * spa_max_replication() could change, so reset
1061 1086                   * os_copies here.
1062 1087                   */
1063 1088                  os->os_copies = spa_max_replication(os->os_spa);
1064 1089          }
1065 1090  
1066 1091          /*
1067 1092           * Create the root block IO
1068 1093           */
1069 1094          SET_BOOKMARK(&zb, os->os_dsl_dataset ?
1070 1095              os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
1071 1096              ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
1072 1097          arc_release(os->os_phys_buf, &os->os_phys_buf);
1073 1098  
1074 1099          dmu_write_policy(os, NULL, 0, 0, &zp);
1075 1100  
                   /* The zio is only created here; it is issued at the very end. */
1076 1101          zio = arc_write(pio, os->os_spa, tx->tx_txg,
1077 1102              os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
1078 1103              DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready,
1079 1104              NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE,
1080 1105              ZIO_FLAG_MUSTSUCCEED, &zb);
1081 1106  
1082 1107          /*
1083 1108           * Sync special dnodes - the parent IO for the sync is the root block
1084 1109           */
1085 1110          DMU_META_DNODE(os)->dn_zio = zio;
1086 1111          dnode_sync(DMU_META_DNODE(os), tx);
1087 1112  
1088 1113          os->os_phys->os_flags = os->os_flags;
1089 1114  
                   /* The accounting dnodes are synced only if they exist and are in use. */
1090 1115          if (DMU_USERUSED_DNODE(os) &&
1091 1116              DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
1092 1117                  DMU_USERUSED_DNODE(os)->dn_zio = zio;
1093 1118                  dnode_sync(DMU_USERUSED_DNODE(os), tx);
1094 1119                  DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
1095 1120                  dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
1096 1121          }
1097 1122  
1098 1123          txgoff = tx->tx_txg & TXG_MASK;
1099 1124  
1100 1125          if (dmu_objset_userused_enabled(os)) {
1101 1126                  newlist = &os->os_synced_dnodes;
1102 1127                  /*
1103 1128                   * We must create the list here because it uses the
1104 1129                   * dn_dirty_link[] of this txg.
1105 1130                   */
1106 1131                  list_create(newlist, sizeof (dnode_t),
1107 1132                      offsetof(dnode_t, dn_dirty_link[txgoff]));
1108 1133          }
1109 1134  
                   /* Freed dnodes are synced before dirty ones for this txg slot. */
1110 1135          dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx);
1111 1136          dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx);
1112 1137  
                   /*
                    * Drain the meta-dnode's dirty records for this txg (all are
                    * level 0) and issue the write of each one that has a zio.
                    */
1113 1138          list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
1114 1139          while (dr = list_head(list)) {
1115 1140                  ASSERT0(dr->dr_dbuf->db_level);
1116 1141                  list_remove(list, dr);
1117 1142                  if (dr->dr_zio)
1118 1143                          zio_nowait(dr->dr_zio);
1119 1144          }
1120 1145          /*
1121 1146           * Free intent log blocks up to this tx.
1122 1147           */
1123 1148          zil_sync(os->os_zil, tx);
1124 1149          os->os_phys->os_zil_header = os->os_zil_header;
                   /* Issue the root block write created above. */
1125 1150          zio_nowait(zio);
1126 1151  }
1127 1152  
1128 1153  boolean_t
1129 1154  dmu_objset_is_dirty(objset_t *os, uint64_t txg)
1130 1155  {
                   /*
                    * Return true if the objset has any dnode on its dirty or
                    * free list for the slot selected by txg & TXG_MASK.
                    */
1131 1156          return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) ||
1132 1157              !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK]));
1133 1158  }
1134 1159  
1135 1160  static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
1136 1161  
1137 1162  void
1138 1163  dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
1139 1164  {
                   /*
                    * Record cb in the file-scope used_cbs[] table as the
                    * callback for objsets of type ost.  The table is indexed by
                    * objset type and is consulted by the user/group accounting
                    * code in this file.  ost is used as an index without a
                    * range check.
                    * NOTE(review): callers presumably pass
                    * ost < DMU_OST_NUMTYPES -- confirm.
                    */
1140 1165          used_cbs[ost] = cb;
1141 1166  }
1142 1167  
1143 1168  boolean_t
1144 1169  dmu_objset_userused_enabled(objset_t *os)
1145 1170  {
                   /*
                    * User/group accounting is enabled for an objset when all of
                    * the following hold: the pool version is at least
                    * SPA_VERSION_USERSPACE, a callback is registered for the
                    * objset's on-disk type, and the objset has a userused dnode.
                    */
1146 1171          return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
1147 1172              used_cbs[os->os_phys->os_type] != NULL &&
1148 1173              DMU_USERUSED_DNODE(os) != NULL);
1149 1174  }
1150 1175  
1151 1176  static void
1152 1177  do_userquota_update(objset_t *os, uint64_t used, uint64_t flags,
1153 1178      uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx)
1154 1179  {
                   /*
                    * Adjust the entries for user and group in the objset's
                    * userused and groupused ZAP objects by DNODE_SIZE + used,
                    * negated when subtract is set.  Nothing is done unless flags
                    * has DNODE_FLAG_USERUSED_ACCOUNTED set.  Both ZAP updates
                    * are required to succeed.
                    */
1155 1180          if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
1156 1181                  int64_t delta = DNODE_SIZE + used;
1157 1182                  if (subtract)
1158 1183                          delta = -delta;
1159 1184                  VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT,
1160 1185                      user, delta, tx));
1161 1186                  VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT,
1162 1187                      group, delta, tx));
1163 1188          }
1164 1189  }
1165 1190  
1166 1191  void
1167 1192  dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
1168 1193  {
                   /*
                    * Drain os->os_synced_dnodes.  For each dnode, the old usage
                    * is subtracted from and the new usage added to the
                    * user/group accounting objects.  The dnode's "new" ids then
                    * become its "old" ids, and the reference tagged with the
                    * list is released.
                    */
1169 1194          dnode_t *dn;
1170 1195          list_t *list = &os->os_synced_dnodes;
1171 1196  
                   /* The list may only be populated when accounting is enabled. */
1172 1197          ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os));
1173 1198  
1174 1199          while (dn = list_head(list)) {
1175 1200                  int flags;
1176 1201                  ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
1177 1202                  ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
1178 1203                      dn->dn_phys->dn_flags &
1179 1204                      DNODE_FLAG_USERUSED_ACCOUNTED);
1180 1205  
1181 1206                  /* Allocate the user/groupused objects if necessary. */
1182 1207                  if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
1183 1208                          VERIFY(0 == zap_create_claim(os,
1184 1209                              DMU_USERUSED_OBJECT,
1185 1210                              DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
1186 1211                          VERIFY(0 == zap_create_claim(os,
1187 1212                              DMU_GROUPUSED_OBJECT,
1188 1213                              DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
1189 1214                  }
1190 1215  
1191 1216                  /*
1192 1217                   * We intentionally modify the zap object even if the
1193 1218                   * net delta is zero.  Otherwise
1194 1219                   * the block of the zap obj could be shared between
1195 1220                   * datasets but need to be different between them after
1196 1221                   * a bprewrite.
1197 1222                   */
1198 1223  
                           /* flags is a snapshot taken before dn_mtx is acquired below. */
1199 1224                  flags = dn->dn_id_flags;
1200 1225                  ASSERT(flags);
1201 1226                  if (flags & DN_ID_OLD_EXIST)  {
1202 1227                          do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags,
1203 1228                              dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx);
1204 1229                  }
1205 1230                  if (flags & DN_ID_NEW_EXIST) {
1206 1231                          do_userquota_update(os, DN_USED_BYTES(dn->dn_phys),
1207 1232                              dn->dn_phys->dn_flags,  dn->dn_newuid,
1208 1233                              dn->dn_newgid, B_FALSE, tx);
1209 1234                  }
1210 1235  
                           /*
                            * Reset the "old" usage and, if new ids were recorded,
                            * roll them over into the "old" slots and note which
                            * source (spill or bonus) has been checked.
                            */
1211 1236                  mutex_enter(&dn->dn_mtx);
1212 1237                  dn->dn_oldused = 0;
1213 1238                  dn->dn_oldflags = 0;
1214 1239                  if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
1215 1240                          dn->dn_olduid = dn->dn_newuid;
1216 1241                          dn->dn_oldgid = dn->dn_newgid;
1217 1242                          dn->dn_id_flags |= DN_ID_OLD_EXIST;
1218 1243                          if (dn->dn_bonuslen == 0)
1219 1244                                  dn->dn_id_flags |= DN_ID_CHKED_SPILL;
1220 1245                          else
1221 1246                                  dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1222 1247                  }
1223 1248                  dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
1224 1249                  mutex_exit(&dn->dn_mtx);
1225 1250  
1226 1251                  list_remove(list, dn);
1227 1252                  dnode_rele(dn, list);
1228 1253          }
1229 1254  }
1230 1255  
1231 1256  /*
1232 1257   * Returns a pointer to data to find uid/gid from
1233 1258   *
1234 1259   * If a dirty record for transaction group that is syncing can't
1235 1260   * be found then NULL is returned.  In the NULL case it is assumed
1236 1261   * the uid/gid aren't changing.
1237 1262   */
1238 1263  static void *
1239 1264  dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
1240 1265  {
1241 1266          dbuf_dirty_record_t *dr, **drp;
1242 1267          void *data;
1243 1268  
1244 1269          if (db->db_dirtycnt == 0)
1245 1270                  return (db->db.db_data);  /* Nothing is changing */
1246 1271  
                   /* Walk the dirty records looking for the one belonging to tx. */
1247 1272          for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
1248 1273                  if (dr->dr_txg == tx->tx_txg)
1249 1274                          break;
1250 1275  
1251 1276          if (dr == NULL) {
1252 1277                  data = NULL;
1253 1278          } else {
1254 1279                  dnode_t *dn;
1255 1280  
1256 1281                  DB_DNODE_ENTER(dr->dr_dbuf);
1257 1282                  dn = DB_DNODE(dr->dr_dbuf);
1258 1283  
                           /*
                            * For a spill block on a dnode with no bonus, dr_data
                            * is dereferenced and its b_data is the payload.
                            * Otherwise dr_data itself is returned as the data
                            * pointer.
                            */
1259 1284                  if (dn->dn_bonuslen == 0 &&
1260 1285                      dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
1261 1286                          data = dr->dt.dl.dr_data->b_data;
1262 1287                  else
1263 1288                          data = dr->dt.dl.dr_data;
1264 1289  
1265 1290                  DB_DNODE_EXIT(dr->dr_dbuf);
1266 1291          }
1267 1292  
1268 1293          return (data);
1269 1294  }
1270 1295  
1271 1296  void
1272 1297  dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
1273 1298  {
                   /*
                    * Determine the uid/gid that dn's space is charged to by
                    * passing its bonus or spill data to the callback registered
                    * for the objset's type.  With before set, the result goes to
                    * dn_olduid/dn_oldgid; otherwise to dn_newuid/dn_newgid.
                    * dn_id_flags is updated to record what was found and which
                    * source was checked.  This is a no-op when accounting is not
                    * enabled for the objset.
                    */
1274 1299          objset_t *os = dn->dn_objset;
1275 1300          void *data = NULL;
1276 1301          dmu_buf_impl_t *db = NULL;
1277 1302          uint64_t *user = NULL;
1278 1303          uint64_t *group = NULL;
1279 1304          int flags = dn->dn_id_flags;
1280 1305          int error;
1281 1306          boolean_t have_spill = B_FALSE;
1282 1307  
1283 1308          if (!dmu_objset_userused_enabled(dn->dn_objset))
1284 1309                  return;
1285 1310  
                   /* The "before" ids only need to be captured once. */
1286 1311          if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
1287 1312              DN_ID_CHKED_SPILL)))
1288 1313                  return;
1289 1314  
                   /*
                    * Pick the data source: the on-disk bonus area, the bonus
                    * dbuf, or the spill block.  Whenever a dbuf is used, its
                    * db_mtx is taken here and held until after the callback has
                    * run.
                    */
1290 1315          if (before && dn->dn_bonuslen != 0)
1291 1316                  data = DN_BONUS(dn->dn_phys);
1292 1317          else if (!before && dn->dn_bonuslen != 0) {
1293 1318                  if (dn->dn_bonus) {
1294 1319                          db = dn->dn_bonus;
1295 1320                          mutex_enter(&db->db_mtx);
1296 1321                          data = dmu_objset_userquota_find_data(db, tx);
1297 1322                  } else {
1298 1323                          data = DN_BONUS(dn->dn_phys);
1299 1324                  }
1300 1325          } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
1301 1326                          int rf = 0;
1302 1327  
1303 1328                          if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
1304 1329                                  rf |= DB_RF_HAVESTRUCT;
1305 1330                          error = dmu_spill_hold_by_dnode(dn,
1306 1331                              rf | DB_RF_MUST_SUCCEED,
1307 1332                              FTAG, (dmu_buf_t **)&db);
1308 1333                          ASSERT(error == 0);
1309 1334                          mutex_enter(&db->db_mtx);
1310 1335                          data = (before) ? db->db.db_data :
1311 1336                              dmu_objset_userquota_find_data(db, tx);
1312 1337                          have_spill = B_TRUE;
1313 1338          } else {
                           /* No bonus data and not an SA dnode: nothing to look at. */
1314 1339                  mutex_enter(&dn->dn_mtx);
1315 1340                  dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1316 1341                  mutex_exit(&dn->dn_mtx);
1317 1342                  return;
1318 1343          }
1319 1344  
                   /*
                    * Choose where the callback stores its result.  In the
                    * "after" case with no data, user and group stay NULL.
                    */
1320 1345          if (before) {
1321 1346                  ASSERT(data);
1322 1347                  user = &dn->dn_olduid;
1323 1348                  group = &dn->dn_oldgid;
1324 1349          } else if (data) {
1325 1350                  user = &dn->dn_newuid;
1326 1351                  group = &dn->dn_newgid;
1327 1352          }
1328 1353  
1329 1354          /*
1330 1355           * Must always call the callback in case the object
1331 1356           * type has changed and that type isn't an object type to track
1332 1357           */
1333 1358          error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
1334 1359              user, group);
1335 1360  
1336 1361          /*
1337 1362           * Preserve existing uid/gid when the callback can't determine
1338 1363           * what the new uid/gid are and the callback returned EEXIST.
1339 1364           * The EEXIST error tells us to just use the existing uid/gid.
1340 1365           * If we don't know what the old values are then just assign
1341 1366           * them to 0, since that is a new file  being created.
1342 1367           */
1343 1368          if (!before && data == NULL && error == EEXIST) {
1344 1369                  if (flags & DN_ID_OLD_EXIST) {
1345 1370                          dn->dn_newuid = dn->dn_olduid;
1346 1371                          dn->dn_newgid = dn->dn_oldgid;
1347 1372                  } else {
1348 1373                          dn->dn_newuid = 0;
1349 1374                          dn->dn_newgid = 0;
1350 1375                  }
1351 1376                  error = 0;
1352 1377          }
1353 1378  
1354 1379          if (db)
1355 1380                  mutex_exit(&db->db_mtx);
1356 1381  
                   /*
                    * Record the outcome: on success mark the old or new ids as
                    * present, and always note which source was checked.
                    */
1357 1382          mutex_enter(&dn->dn_mtx);
1358 1383          if (error == 0 && before)
1359 1384                  dn->dn_id_flags |= DN_ID_OLD_EXIST;
1360 1385          if (error == 0 && !before)
1361 1386                  dn->dn_id_flags |= DN_ID_NEW_EXIST;
1362 1387  
1363 1388          if (have_spill) {
1364 1389                  dn->dn_id_flags |= DN_ID_CHKED_SPILL;
1365 1390          } else {
1366 1391                  dn->dn_id_flags |= DN_ID_CHKED_BONUS;
1367 1392          }
1368 1393          mutex_exit(&dn->dn_mtx);
                   /* Only the spill path took a hold (tagged FTAG) on db. */
1369 1394          if (have_spill)
1370 1395                  dmu_buf_rele((dmu_buf_t *)db, FTAG);
1371 1396  }
1372 1397  
1373 1398  boolean_t
1374 1399  dmu_objset_userspace_present(objset_t *os)
1375 1400  {
                   /*
                    * Report whether OBJSET_FLAG_USERACCOUNTING_COMPLETE is set in
                    * os->os_phys->os_flags.  The masked flag word itself is
                    * returned, so the result is zero/nonzero rather than a value
                    * explicitly normalized to B_FALSE/B_TRUE.
                    */
1376 1401          return (os->os_phys->os_flags &
1377 1402              OBJSET_FLAG_USERACCOUNTING_COMPLETE);
1378 1403  }
1379 1404  
1380 1405  int
1381 1406  dmu_objset_userspace_upgrade(objset_t *os)
1382 1407  {
                   /*
                    * Dirty the bonus buffer of every object in 'os' so that each
                    * object is synced out again, then set the
                    * user-accounting-complete flag and wait for a sync.
                    *
                    * Returns 0 on success or when accounting is already present,
                    * ENOTSUP when dmu_objset_userused_enabled() is false, EINVAL
                    * for a snapshot, and EINTR when a signal is pending.  A
                    * per-object failure (objerr) only skips that object.
                    */
1383 1408          uint64_t obj;
1384 1409          int err = 0;
1385 1410  
1386 1411          if (dmu_objset_userspace_present(os))
1387 1412                  return (0);
1388 1413          if (!dmu_objset_userused_enabled(os))
1389 1414                  return (SET_ERROR(ENOTSUP));
1390 1415          if (dmu_objset_is_snapshot(os))
1391 1416                  return (SET_ERROR(EINVAL));
1392 1417  
1393 1418          /*
1394 1419           * We simply need to mark every object dirty, so that it will be
1395 1420           * synced out and now accounted.  If this is called
1396 1421           * concurrently, or if we already did some work before crashing,
1397 1422           * that's fine, since we track each object's accounted state
1398 1423           * independently.
1399 1424           */
1400 1425  
                   /*
                    * The walk stops as soon as dmu_object_next() returns nonzero;
                    * that value is not examined after the loop.
                    */
1401 1426          for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
1402 1427                  dmu_tx_t *tx;
1403 1428                  dmu_buf_t *db;
1404 1429                  int objerr;
1405 1430  
1406 1431                  if (issig(JUSTLOOKING) && issig(FORREAL))
1407 1432                          return (SET_ERROR(EINTR));
1408 1433  
1409 1434                  objerr = dmu_bonus_hold(os, obj, FTAG, &db);
1410 1435                  if (objerr != 0)
1411 1436                          continue;
1412 1437                  tx = dmu_tx_create(os);
1413 1438                  dmu_tx_hold_bonus(tx, obj);
1414 1439                  objerr = dmu_tx_assign(tx, TXG_WAIT);
1415 1440                  if (objerr != 0) {
                                   /*
                                    * Release the bonus hold taken above before
                                    * skipping this object; without this the hold
                                    * tagged FTAG would never be dropped.
                                    */
                                   dmu_buf_rele(db, FTAG);
1416 1441                          dmu_tx_abort(tx);
1417 1442                          continue;
1418 1443                  }
1419 1444                  dmu_buf_will_dirty(db, tx);
1420 1445                  dmu_buf_rele(db, FTAG);
1421 1446                  dmu_tx_commit(tx);
1422 1447          }
1423 1448  
                   /*
                    * The flag is set in os->os_flags (not os->os_phys->os_flags);
                    * presumably it reaches the on-disk copy at sync time -- confirm
                    * against the objset sync path.
                    */
1424 1449          os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
1425 1450          txg_wait_synced(dmu_objset_pool(os), 0);
1426 1451          return (0);
1427 1452  }
1428 1453  
1429 1454  void
1430 1455  dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
1431 1456      uint64_t *usedobjsp, uint64_t *availobjsp)
1432 1457  {
                   /*
                    * Thin wrapper: forward the four output pointers unchanged to
                    * dsl_dataset_space() for os->os_dsl_dataset.  The dataset
                    * pointer is not checked for NULL here.
                    */
1433 1458          dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
1434 1459              usedobjsp, availobjsp);
1435 1460  }
1436 1461  
1437 1462  uint64_t
1438 1463  dmu_objset_fsid_guid(objset_t *os)
1439 1464  {
                   /*
                    * Return whatever dsl_dataset_fsid_guid() reports for
                    * os->os_dsl_dataset; the dataset pointer is passed through
                    * without a NULL check.
                    */
1440 1465          return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
1441 1466  }
1442 1467  
1443 1468  void
1444 1469  dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
1445 1470  {
                   /*
                    * Fill in stat->dds_type from the objset's on-disk type, then
                    * let dsl_dataset_fast_stat() fill in the rest when the objset
                    * has a dataset.  With no dataset only dds_type is written.
                    */
1446 1471          stat->dds_type = os->os_phys->os_type;
1447 1472          if (os->os_dsl_dataset)
1448 1473                  dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
1449 1474  }
1450 1475  
1451 1476  void
1452 1477  dmu_objset_stats(objset_t *os, nvlist_t *nv)
1453 1478  {
                   /*
                    * Add this objset's statistics to 'nv'.  An objset without a
                    * dataset is only expected when its type is DMU_OST_META
                    * (asserted).  Dataset-level stats are added when a dataset
                    * exists; the objset type and the user-accounting-present
                    * value are always added.
                    */
1454 1479          ASSERT(os->os_dsl_dataset ||
1455 1480              os->os_phys->os_type == DMU_OST_META);
1456 1481  
1457 1482          if (os->os_dsl_dataset != NULL)
1458 1483                  dsl_dataset_stats(os->os_dsl_dataset, nv);
1459 1484  
  
    | 
      ↓ open down ↓ | 
    543 lines elided | 
    
      ↑ open up ↑ | 
  
1460 1485          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
1461 1486              os->os_phys->os_type);
1462 1487          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
1463 1488              dmu_objset_userspace_present(os));
1464 1489  }
1465 1490  
1466 1491  int
1467 1492  dmu_objset_is_snapshot(objset_t *os)
1468 1493  {
                   /*
                    * Report whether the dataset behind 'os' is a snapshot.  An
                    * objset with no dataset is reported as B_FALSE.  This change
                    * reads the dataset's ds_is_snapshot field directly instead of
                    * calling dsl_dataset_is_snapshot().
                    */
1469 1494          if (os->os_dsl_dataset != NULL)
1470      -                return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
     1495 +                return (os->os_dsl_dataset->ds_is_snapshot);
1471 1496          else
1472 1497                  return (B_FALSE);
1473 1498  }
1474 1499  
1475 1500  int
1476 1501  dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
1477 1502      boolean_t *conflict)
1478 1503  {
                   /*
                    * Look 'name' up in the snapshot-names ZAP object of the
                    * dataset behind 'os', using zap_lookup_norm() with MT_FIRST.
                    * 'real', 'maxlen' and 'conflict' are handed to that call
                    * unchanged; the looked-up integer value is discarded.
                    *
                    * Returns ENOENT when the dataset has no snapshot-names ZAP
                    * (ds_snapnames_zapobj == 0); otherwise returns the result of
                    * zap_lookup_norm().  os->os_dsl_dataset is dereferenced
                    * without a NULL check.
                    */
1479 1504          dsl_dataset_t *ds = os->os_dsl_dataset;
1480 1505          uint64_t ignored;
1481 1506  
1482 1507          if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
1483 1508                  return (SET_ERROR(ENOENT));
1484 1509  
1485 1510          return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
1486 1511              dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
1487 1512              MT_FIRST, real, maxlen, conflict));
1488 1513  }
1489 1514  
1490 1515  int
1491 1516  dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
1492 1517      uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
1493 1518  {
                   /*
                    * Fetch one entry from the snapshot-names ZAP of the dataset
                    * behind 'os', starting at the serialized cursor position
                    * *offp, and advance *offp past it.
                    *
                    * On success (0) the entry name is copied into 'name'; *idp
                    * and *case_conflict are filled in only when those pointers
                    * are non-NULL.  Returns ENOENT when there is no
                    * snapshot-names ZAP or when zap_cursor_retrieve() fails for
                    * any reason, and ENAMETOOLONG when the name plus its
                    * terminator does not fit in 'namelen' bytes.  *offp is left
                    * untouched on failure.  The pool config lock must be held
                    * (asserted).
                    */
1494 1519          dsl_dataset_t *ds = os->os_dsl_dataset;
1495 1520          zap_cursor_t cursor;
1496 1521          zap_attribute_t attr;
1497 1522  
1498 1523          ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
1499 1524  
1500 1525          if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
1501 1526                  return (SET_ERROR(ENOENT));
1502 1527  
1503 1528          zap_cursor_init_serialized(&cursor,
1504 1529              ds->ds_dir->dd_pool->dp_meta_objset,
1505 1530              dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
1506 1531  
1507 1532          if (zap_cursor_retrieve(&cursor, &attr) != 0) {
1508 1533                  zap_cursor_fini(&cursor);
1509 1534                  return (SET_ERROR(ENOENT));
1510 1535          }
1511 1536  
1512 1537          if (strlen(attr.za_name) + 1 > namelen) {
1513 1538                  zap_cursor_fini(&cursor);
1514 1539                  return (SET_ERROR(ENAMETOOLONG));
1515 1540          }
1516 1541  
                   /* The length check above guarantees this copy fits. */
1517 1542          (void) strcpy(name, attr.za_name);
1518 1543          if (idp)
1519 1544                  *idp = attr.za_first_integer;
1520 1545          if (case_conflict)
1521 1546                  *case_conflict = attr.za_normalization_conflict;
1522 1547          zap_cursor_advance(&cursor);
1523 1548          *offp = zap_cursor_serialize(&cursor);
1524 1549          zap_cursor_fini(&cursor);
1525 1550  
1526 1551          return (0);
1527 1552  }
1528 1553  
1529 1554  int
1530 1555  dmu_dir_list_next(objset_t *os, int namelen, char *name,
1531 1556      uint64_t *idp, uint64_t *offp)
1532 1557  {
                   /*
                    * Fetch one entry from the child-directory ZAP of the dsl_dir
                    * that owns the dataset behind 'os', starting at the
                    * serialized cursor position *offp, and advance *offp past it.
                    *
                    * On success (0) the entry name is copied into 'name' and,
                    * when idp is non-NULL, its first integer into *idp.  Returns
                    * ENOENT when the dataset is not its directory's head dataset
                    * or when zap_cursor_retrieve() fails for any reason, and
                    * ENAMETOOLONG when the name plus its terminator does not fit
                    * in 'namelen' bytes.  *offp is left untouched on failure.
                    */
1533 1558          dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
1534 1559          zap_cursor_t cursor;
1535 1560          zap_attribute_t attr;
1536 1561  
1537 1562          /* there is no next dir on a snapshot! */
1538 1563          if (os->os_dsl_dataset->ds_object !=
1539 1564              dsl_dir_phys(dd)->dd_head_dataset_obj)
1540 1565                  return (SET_ERROR(ENOENT));
1541 1566  
1542 1567          zap_cursor_init_serialized(&cursor,
1543 1568              dd->dd_pool->dp_meta_objset,
1544 1569              dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
1545 1570  
1546 1571          if (zap_cursor_retrieve(&cursor, &attr) != 0) {
1547 1572                  zap_cursor_fini(&cursor);
1548 1573                  return (SET_ERROR(ENOENT));
1549 1574          }
1550 1575  
1551 1576          if (strlen(attr.za_name) + 1 > namelen) {
1552 1577                  zap_cursor_fini(&cursor);
1553 1578                  return (SET_ERROR(ENAMETOOLONG));
1554 1579          }
1555 1580  
                   /* The length check above guarantees this copy fits. */
1556 1581          (void) strcpy(name, attr.za_name);
1557 1582          if (idp)
1558 1583                  *idp = attr.za_first_integer;
1559 1584          zap_cursor_advance(&cursor);
1560 1585          *offp = zap_cursor_serialize(&cursor);
1561 1586          zap_cursor_fini(&cursor);
1562 1587  
1563 1588          return (0);
1564 1589  }
1565 1590  
1566 1591  /*
1567 1592   * Find objsets under and including ddobj, call func(ds) on each.
            *
            * The caller must hold the pool config lock (asserted below).  A
            * directory whose name starts with '$' is skipped and 0 is returned.
            * Child directories are visited, recursively, only when
            * DS_FIND_CHILDREN is set, and snapshots only when DS_FIND_SNAPSHOTS
            * is set; the head dataset of ddobj itself is visited last.  The
            * first nonzero value from a hold or from func stops the walk and is
            * returned.
1568 1593   */
1569 1594  int
1570 1595  dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
1571 1596      int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
1572 1597  {
1573 1598          dsl_dir_t *dd;
1574 1599          dsl_dataset_t *ds;
1575 1600          zap_cursor_t zc;
1576 1601          zap_attribute_t *attr;
1577 1602          uint64_t thisobj;
1578 1603          int err;
1579 1604  
1580 1605          ASSERT(dsl_pool_config_held(dp));
1581 1606  
1582 1607          err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd);
1583 1608          if (err != 0)
1584 1609                  return (err);
1585 1610  
1586 1611          /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1587 1612          if (dd->dd_myname[0] == '$') {
1588 1613                  dsl_dir_rele(dd, FTAG);
1589 1614                  return (0);
1590 1615          }
1591 1616  
                   /*
                    * Remember the head dataset object now; it is still needed
                    * after 'dd' has been released further down.
                    */
1592 1617          thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1593 1618          attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1594 1619  
1595 1620          /*
1596 1621           * Iterate over all children.
1597 1622           */
1598 1623          if (flags & DS_FIND_CHILDREN) {
1599 1624                  for (zap_cursor_init(&zc, dp->dp_meta_objset,
1600 1625                      dsl_dir_phys(dd)->dd_child_dir_zapobj);
1601 1626                      zap_cursor_retrieve(&zc, attr) == 0;
1602 1627                      (void) zap_cursor_advance(&zc)) {
1603 1628                          ASSERT3U(attr->za_integer_length, ==,
1604 1629                              sizeof (uint64_t));
1605 1630                          ASSERT3U(attr->za_num_integers, ==, 1);
1606 1631  
1607 1632                          err = dmu_objset_find_dp(dp, attr->za_first_integer,
1608 1633                              func, arg, flags);
1609 1634                          if (err != 0)
1610 1635                                  break;
1611 1636                  }
1612 1637                  zap_cursor_fini(&zc);
1613 1638  
1614 1639                  if (err != 0) {
1615 1640                          dsl_dir_rele(dd, FTAG);
1616 1641                          kmem_free(attr, sizeof (zap_attribute_t));
1617 1642                          return (err);
1618 1643                  }
1619 1644          }
1620 1645  
1621 1646          /*
1622 1647           * Iterate over all snapshots.
1623 1648           */
1624 1649          if (flags & DS_FIND_SNAPSHOTS) {
                           /*
                            * NOTE: this 'ds' shadows the function-scope 'ds' that
                            * is used for "Apply to self" below.
                            */
1625 1650                  dsl_dataset_t *ds;
1626 1651                  err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1627 1652  
1628 1653                  if (err == 0) {
1629 1654                          uint64_t snapobj;
1630 1655  
                                   /*
                                    * Only the ZAP object number is needed, so
                                    * the head dataset hold is dropped right away.
                                    */
1631 1656                          snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1632 1657                          dsl_dataset_rele(ds, FTAG);
1633 1658  
1634 1659                          for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1635 1660                              zap_cursor_retrieve(&zc, attr) == 0;
1636 1661                              (void) zap_cursor_advance(&zc)) {
1637 1662                                  ASSERT3U(attr->za_integer_length, ==,
1638 1663                                      sizeof (uint64_t));
1639 1664                                  ASSERT3U(attr->za_num_integers, ==, 1);
1640 1665  
1641 1666                                  err = dsl_dataset_hold_obj(dp,
1642 1667                                      attr->za_first_integer, FTAG, &ds);
1643 1668                                  if (err != 0)
1644 1669                                          break;
1645 1670                                  err = func(dp, ds, arg);
1646 1671                                  dsl_dataset_rele(ds, FTAG);
1647 1672                                  if (err != 0)
1648 1673                                          break;
1649 1674                          }
1650 1675                          zap_cursor_fini(&zc);
1651 1676                  }
1652 1677          }
1653 1678  
                   /*
                    * Clean up before looking at err, so that a failure in the
                    * snapshot pass takes the same release path as success.
                    */
1654 1679          dsl_dir_rele(dd, FTAG);
1655 1680          kmem_free(attr, sizeof (zap_attribute_t));
1656 1681  
1657 1682          if (err != 0)
1658 1683                  return (err);
1659 1684  
1660 1685          /*
1661 1686           * Apply to self.
1662 1687           */
1663 1688          err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1664 1689          if (err != 0)
1665 1690                  return (err);
1666 1691          err = func(dp, ds, arg);
1667 1692          dsl_dataset_rele(ds, FTAG);
1668 1693          return (err);
1669 1694  }
1670 1695  
1671 1696  /*
1672 1697   * Find all objsets under name, and for each, call 'func(child_name, arg)'.
1673 1698   * The dp_config_rwlock must not be held when this is called, and it
1674 1699   * will not be held when the callback is called.
1675 1700   * Therefore this function should only be used when the pool is not changing
1676 1701   * (e.g. in syncing context), or the callback can deal with the possible races.
            *
            * Child directories are visited, recursively, only when
            * DS_FIND_CHILDREN is set, and snapshots only when DS_FIND_SNAPSHOTS
            * is set; 'name' itself is visited last.  A directory whose name
            * starts with '$' is skipped and 0 is returned.  The first nonzero
            * value from a hold or from func stops the walk and is returned.
1677 1702   */
1678 1703  static int
1679 1704  dmu_objset_find_impl(spa_t *spa, const char *name,
1680 1705      int func(const char *, void *), void *arg, int flags)
1681 1706  {
1682 1707          dsl_dir_t *dd;
1683 1708          dsl_pool_t *dp = spa_get_dsl(spa);
1684 1709          dsl_dataset_t *ds;
1685 1710          zap_cursor_t zc;
1686 1711          zap_attribute_t *attr;
1687 1712          char *child;
1688 1713          uint64_t thisobj;
1689 1714          int err;
1690 1715  
1691 1716          dsl_pool_config_enter(dp, FTAG);
1692 1717  
1693 1718          err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
1694 1719          if (err != 0) {
1695 1720                  dsl_pool_config_exit(dp, FTAG);
1696 1721                  return (err);
1697 1722          }
1698 1723  
1699 1724          /* Don't visit hidden ($MOS & $ORIGIN) objsets. */
1700 1725          if (dd->dd_myname[0] == '$') {
1701 1726                  dsl_dir_rele(dd, FTAG);
1702 1727                  dsl_pool_config_exit(dp, FTAG);
1703 1728                  return (0);
1704 1729          }
1705 1730  
1706 1731          thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
1707 1732          attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
1708 1733  
1709 1734          /*
1710 1735           * Iterate over all children.
1711 1736           */
1712 1737          if (flags & DS_FIND_CHILDREN) {
1713 1738                  for (zap_cursor_init(&zc, dp->dp_meta_objset,
1714 1739                      dsl_dir_phys(dd)->dd_child_dir_zapobj);
1715 1740                      zap_cursor_retrieve(&zc, attr) == 0;
1716 1741                      (void) zap_cursor_advance(&zc)) {
1717 1742                          ASSERT3U(attr->za_integer_length, ==,
1718 1743                              sizeof (uint64_t));
1719 1744                          ASSERT3U(attr->za_num_integers, ==, 1);
1720 1745  
                                   /*
                                    * The recursive call enters the config lock
                                    * itself, so drop it here and re-enter it
                                    * afterwards to continue the ZAP walk.
                                    */
1721 1746                          child = kmem_asprintf("%s/%s", name, attr->za_name);
1722 1747                          dsl_pool_config_exit(dp, FTAG);
1723 1748                          err = dmu_objset_find_impl(spa, child,
1724 1749                              func, arg, flags);
1725 1750                          dsl_pool_config_enter(dp, FTAG);
1726 1751                          strfree(child);
1727 1752                          if (err != 0)
1728 1753                                  break;
1729 1754                  }
1730 1755                  zap_cursor_fini(&zc);
1731 1756  
1732 1757                  if (err != 0) {
1733 1758                          dsl_dir_rele(dd, FTAG);
1734 1759                          dsl_pool_config_exit(dp, FTAG);
1735 1760                          kmem_free(attr, sizeof (zap_attribute_t));
1736 1761                          return (err);
1737 1762                  }
1738 1763          }
1739 1764  
1740 1765          /*
1741 1766           * Iterate over all snapshots.
1742 1767           */
1743 1768          if (flags & DS_FIND_SNAPSHOTS) {
1744 1769                  err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
1745 1770  
1746 1771                  if (err == 0) {
1747 1772                          uint64_t snapobj;
1748 1773  
                                   /*
                                    * Only the ZAP object number is needed, so
                                    * the head dataset hold is dropped right away.
                                    */
1749 1774                          snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
1750 1775                          dsl_dataset_rele(ds, FTAG);
1751 1776  
1752 1777                          for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
1753 1778                              zap_cursor_retrieve(&zc, attr) == 0;
1754 1779                              (void) zap_cursor_advance(&zc)) {
1755 1780                                  ASSERT3U(attr->za_integer_length, ==,
1756 1781                                      sizeof (uint64_t));
1757 1782                                  ASSERT3U(attr->za_num_integers, ==, 1);
1758 1783  
                                           /*
                                            * Run the callback with the config
                                            * lock dropped, then re-enter it to
                                            * continue the ZAP walk.
                                            */
1759 1784                                  child = kmem_asprintf("%s@%s",
1760 1785                                      name, attr->za_name);
1761 1786                                  dsl_pool_config_exit(dp, FTAG);
1762 1787                                  err = func(child, arg);
1763 1788                                  dsl_pool_config_enter(dp, FTAG);
1764 1789                                  strfree(child);
1765 1790                                  if (err != 0)
1766 1791                                          break;
1767 1792                          }
1768 1793                          zap_cursor_fini(&zc);
1769 1794                  }
1770 1795          }
1771 1796  
1772 1797          dsl_dir_rele(dd, FTAG);
1773 1798          kmem_free(attr, sizeof (zap_attribute_t));
1774 1799          dsl_pool_config_exit(dp, FTAG);
1775 1800  
1776 1801          if (err != 0)
1777 1802                  return (err);
1778 1803  
1779 1804          /* Apply to self. */
1780 1805          return (func(name, arg));
1781 1806  }
1782 1807  
1783 1808  /*
1784 1809   * See comment above dmu_objset_find_impl().
            *
            * Opens the pool for 'name' with spa_open(), runs
            * dmu_objset_find_impl() on it, and closes the pool again.  A
            * spa_open() failure is returned directly; otherwise the result of
            * the walk is returned.
1785 1810   */
1786 1811  int
1787 1812  dmu_objset_find(char *name, int func(const char *, void *), void *arg,
1788 1813      int flags)
1789 1814  {
1790 1815          spa_t *spa;
1791 1816          int error;
1792 1817  
1793 1818          error = spa_open(name, &spa, FTAG);
1794 1819          if (error != 0)
1795 1820                  return (error);
1796 1821          error = dmu_objset_find_impl(spa, name, func, arg, flags);
1797 1822          spa_close(spa, FTAG);
1798 1823          return (error);
1799 1824  }
1800 1825  
1801 1826  void
1802 1827  dmu_objset_set_user(objset_t *os, void *user_ptr)
1803 1828  {
                   /*
                    * Store an opaque per-objset pointer in os->os_user_ptr.  The
                    * caller must already hold os->os_user_ptr_lock (asserted).
                    */
1804 1829          ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
1805 1830          os->os_user_ptr = user_ptr;
1806 1831  }
1807 1832  
1808 1833  void *
1809 1834  dmu_objset_get_user(objset_t *os)
1810 1835  {
                   /*
                    * Return the pointer currently stored in os->os_user_ptr.  The
                    * caller must already hold os->os_user_ptr_lock (asserted).
                    */
1811 1836          ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
1812 1837          return (os->os_user_ptr);
1813 1838  }
1814 1839  
1815 1840  /*
1816 1841   * Determine name of filesystem, given name of snapshot.
1817 1842   * buf must be at least MAXNAMELEN bytes
            *
            * The filesystem name is everything before the first '@' in
            * snapname.  Returns 0 on success, EINVAL if snapname contains no
            * '@', and ENAMETOOLONG if the part before the '@' is MAXNAMELEN
            * characters or longer.  buf is not written on failure.
1818 1843   */
1819 1844  int
1820 1845  dmu_fsname(const char *snapname, char *buf)
1821 1846  {
1822 1847          char *atp = strchr(snapname, '@');
1823 1848          if (atp == NULL)
1824 1849                  return (SET_ERROR(EINVAL));
1825 1850          if (atp - snapname >= MAXNAMELEN)
1826 1851                  return (SET_ERROR(ENAMETOOLONG));
                   /*
                    * The size argument is the prefix length plus one for the
                    * terminator, so exactly the text before the '@' is copied.
                    */
1827 1852          (void) strlcpy(buf, snapname, atp - snapname + 1);
1828 1853          return (0);
1829 1854  }
  
    | 
      ↓ open down ↓ | 
    349 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX