Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dmu_objset.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_objset.c
↓ open down ↓ 15 lines elided ↑ open up ↑
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24   24   * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  25   25   * Copyright (c) 2013, Joyent, Inc. All rights reserved.
       26 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  26   27   */
  27   28  
  28   29  /* Portions Copyright 2010 Robert Milkowski */
  29   30  
  30   31  #include <sys/cred.h>
  31   32  #include <sys/zfs_context.h>
  32   33  #include <sys/dmu_objset.h>
  33   34  #include <sys/dsl_dir.h>
  34   35  #include <sys/dsl_dataset.h>
  35   36  #include <sys/dsl_prop.h>
↓ open down ↓ 312 lines elided ↑ open up ↑
 348  349           */
 349  350          if (ds != NULL) {
 350  351                  err = dsl_prop_register(ds,
 351  352                      zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 352  353                      primary_cache_changed_cb, os);
 353  354                  if (err == 0) {
 354  355                          err = dsl_prop_register(ds,
 355  356                              zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 356  357                              secondary_cache_changed_cb, os);
 357  358                  }
 358      -                if (!dsl_dataset_is_snapshot(ds)) {
      359 +                if (!ds->ds_is_snapshot) {
 359  360                          if (err == 0) {
 360  361                                  err = dsl_prop_register(ds,
 361  362                                      zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 362  363                                      checksum_changed_cb, os);
 363  364                          }
 364  365                          if (err == 0) {
 365  366                                  err = dsl_prop_register(ds,
 366  367                                      zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 367  368                                      compression_changed_cb, os);
 368  369                          }
↓ open down ↓ 41 lines elided ↑ open up ↑
 410  411                  os->os_compress = ZIO_COMPRESS_LZJB;
 411  412                  os->os_copies = spa_max_replication(spa);
 412  413                  os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
 413  414                  os->os_dedup_verify = B_FALSE;
 414  415                  os->os_logbias = ZFS_LOGBIAS_LATENCY;
 415  416                  os->os_sync = ZFS_SYNC_STANDARD;
 416  417                  os->os_primary_cache = ZFS_CACHE_ALL;
 417  418                  os->os_secondary_cache = ZFS_CACHE_ALL;
 418  419          }
 419  420  
 420      -        if (ds == NULL || !dsl_dataset_is_snapshot(ds))
      421 +        if (ds == NULL || !ds->ds_is_snapshot)
 421  422                  os->os_zil_header = os->os_phys->os_zil_header;
 422  423          os->os_zil = zil_alloc(os, &os->os_zil_header);
 423  424  
 424  425          for (i = 0; i < TXG_SIZE; i++) {
 425  426                  list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
 426  427                      offsetof(dnode_t, dn_dirty_link[i]));
 427  428                  list_create(&os->os_free_dnodes[i], sizeof (dnode_t),
 428  429                      offsetof(dnode_t, dn_dirty_link[i]));
 429  430          }
 430  431          list_create(&os->os_dnodes, sizeof (dnode_t),
 431  432              offsetof(dnode_t, dn_link));
 432  433          list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
 433  434              offsetof(dmu_buf_impl_t, db_link));
 434  435  
 435  436          mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
 436  437          mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 437  438          mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 438  439  
 439      -        DMU_META_DNODE(os) = dnode_special_open(os,
 440      -            &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT,
 441      -            &os->os_meta_dnode);
      440 +        dnode_special_open(os, &os->os_phys->os_meta_dnode,
      441 +            DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
 442  442          if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
 443      -                DMU_USERUSED_DNODE(os) = dnode_special_open(os,
 444      -                    &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT,
 445      -                    &os->os_userused_dnode);
 446      -                DMU_GROUPUSED_DNODE(os) = dnode_special_open(os,
 447      -                    &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT,
 448      -                    &os->os_groupused_dnode);
      443 +                dnode_special_open(os, &os->os_phys->os_userused_dnode,
      444 +                    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
      445 +                dnode_special_open(os, &os->os_phys->os_groupused_dnode,
      446 +                    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
 449  447          }
 450  448  
 451  449          *osp = os;
 452  450          return (0);
 453  451  }
 454  452  
 455  453  int
 456  454  dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 457  455  {
 458  456          int err = 0;
↓ open down ↓ 67 lines elided ↑ open up ↑
 526  524                  return (err);
 527  525          }
 528  526  
 529  527          err = dmu_objset_from_ds(ds, osp);
 530  528          dsl_pool_rele(dp, FTAG);
 531  529          if (err != 0) {
 532  530                  dsl_dataset_disown(ds, tag);
 533  531          } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
 534  532                  dsl_dataset_disown(ds, tag);
 535  533                  return (SET_ERROR(EINVAL));
 536      -        } else if (!readonly && dsl_dataset_is_snapshot(ds)) {
      534 +        } else if (!readonly && ds->ds_is_snapshot) {
 537  535                  dsl_dataset_disown(ds, tag);
 538  536                  return (SET_ERROR(EROFS));
 539  537          }
 540  538          return (err);
 541  539  }
 542  540  
 543  541  void
 544  542  dmu_objset_rele(objset_t *os, void *tag)
 545  543  {
 546  544          dsl_pool_t *dp = dmu_objset_pool(os);
↓ open down ↓ 35 lines elided ↑ open up ↑
 582  580  
 583  581  void
 584  582  dmu_objset_disown(objset_t *os, void *tag)
 585  583  {
 586  584          dsl_dataset_disown(os->os_dsl_dataset, tag);
 587  585  }
 588  586  
 589  587  void
 590  588  dmu_objset_evict_dbufs(objset_t *os)
 591  589  {
      590 +        dnode_t dn_marker;
 592  591          dnode_t *dn;
 593  592  
 594  593          mutex_enter(&os->os_lock);
      594 +        dn = list_head(&os->os_dnodes);
      595 +        while (dn != NULL) {
      596 +                /*
      597 +                 * Skip dnodes without holds.  We have to do this dance
      598 +                 * because dnode_add_ref() only works if there is already a
      599 +                 * hold.  If the dnode has no holds, then it has no dbufs.
      600 +                 */
      601 +                if (dnode_add_ref(dn, FTAG)) {
      602 +                        list_insert_after(&os->os_dnodes, dn, &dn_marker);
      603 +                        mutex_exit(&os->os_lock);
 595  604  
 596      -        /* process the mdn last, since the other dnodes have holds on it */
 597      -        list_remove(&os->os_dnodes, DMU_META_DNODE(os));
 598      -        list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os));
      605 +                        dnode_evict_dbufs(dn);
      606 +                        dnode_rele(dn, FTAG);
 599  607  
 600      -        /*
 601      -         * Find the first dnode with holds.  We have to do this dance
 602      -         * because dnode_add_ref() only works if you already have a
 603      -         * hold.  If there are no holds then it has no dbufs so OK to
 604      -         * skip.
 605      -         */
 606      -        for (dn = list_head(&os->os_dnodes);
 607      -            dn && !dnode_add_ref(dn, FTAG);
 608      -            dn = list_next(&os->os_dnodes, dn))
 609      -                continue;
 610      -
 611      -        while (dn) {
 612      -                dnode_t *next_dn = dn;
 613      -
 614      -                do {
 615      -                        next_dn = list_next(&os->os_dnodes, next_dn);
 616      -                } while (next_dn && !dnode_add_ref(next_dn, FTAG));
 617      -
 618      -                mutex_exit(&os->os_lock);
 619      -                dnode_evict_dbufs(dn);
 620      -                dnode_rele(dn, FTAG);
 621      -                mutex_enter(&os->os_lock);
 622      -                dn = next_dn;
      608 +                        mutex_enter(&os->os_lock);
      609 +                        dn = list_next(&os->os_dnodes, &dn_marker);
      610 +                        list_remove(&os->os_dnodes, &dn_marker);
      611 +                } else {
      612 +                        dn = list_next(&os->os_dnodes, dn);
      613 +                }
 623  614          }
 624  615          mutex_exit(&os->os_lock);
      616 +
      617 +        if (DMU_USERUSED_DNODE(os) != NULL) {
      618 +                dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
      619 +                dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
      620 +        }
      621 +        dnode_evict_dbufs(DMU_META_DNODE(os));
 625  622  }
 626  623  
      624 +/*
      625 + * Objset eviction processing is split into two pieces.
      626 + * The first marks the objset as evicting, evicts any dbufs that
      627 + * have a refcount of zero, and then queues up the objset for the
      628 + * second phase of eviction.  Once os->os_dnodes has been cleared by
      629 + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
      630 + * The second phase closes the special dnodes, dequeues the objset from
      631 + * the list of those undergoing eviction, and finally frees the objset.
      632 + *
      633 + * NOTE: Due to asynchronous eviction processing (invocation of
      634 + *       dnode_buf_pageout()), it is possible for the meta dnode for the
      635 + *       objset to have no holds even though os->os_dnodes is not empty.
      636 + */
 627  637  void
 628  638  dmu_objset_evict(objset_t *os)
 629  639  {
 630  640          dsl_dataset_t *ds = os->os_dsl_dataset;
 631  641  
 632  642          for (int t = 0; t < TXG_SIZE; t++)
 633  643                  ASSERT(!dmu_objset_is_dirty(os, t));
 634  644  
 635  645          if (ds) {
 636      -                if (!dsl_dataset_is_snapshot(ds)) {
      646 +                if (!ds->ds_is_snapshot) {
 637  647                          VERIFY0(dsl_prop_unregister(ds,
 638  648                              zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 639  649                              checksum_changed_cb, os));
 640  650                          VERIFY0(dsl_prop_unregister(ds,
 641  651                              zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 642  652                              compression_changed_cb, os));
 643  653                          VERIFY0(dsl_prop_unregister(ds,
 644  654                              zfs_prop_to_name(ZFS_PROP_COPIES),
 645  655                              copies_changed_cb, os));
 646  656                          VERIFY0(dsl_prop_unregister(ds,
↓ open down ↓ 16 lines elided ↑ open up ↑
 663  673                      zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 664  674                      primary_cache_changed_cb, os));
 665  675                  VERIFY0(dsl_prop_unregister(ds,
 666  676                      zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 667  677                      secondary_cache_changed_cb, os));
 668  678          }
 669  679  
 670  680          if (os->os_sa)
 671  681                  sa_tear_down(os);
 672  682  
      683 +        os->os_evicting = B_TRUE;
 673  684          dmu_objset_evict_dbufs(os);
 674  685  
      686 +        mutex_enter(&os->os_lock);
      687 +        spa_evicting_os_register(os->os_spa, os);
      688 +        if (list_is_empty(&os->os_dnodes)) {
      689 +                mutex_exit(&os->os_lock);
      690 +                dmu_objset_evict_done(os);
      691 +        } else {
      692 +                mutex_exit(&os->os_lock);
      693 +        }
      694 +}
      695 +
      696 +void
      697 +dmu_objset_evict_done(objset_t *os)
      698 +{
      699 +        ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
      700 +
 675  701          dnode_special_close(&os->os_meta_dnode);
 676  702          if (DMU_USERUSED_DNODE(os)) {
 677  703                  dnode_special_close(&os->os_userused_dnode);
 678  704                  dnode_special_close(&os->os_groupused_dnode);
 679  705          }
 680  706          zil_free(os->os_zil);
 681  707  
 682      -        ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 683      -
 684  708          VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf));
 685  709  
 686  710          /*
 687  711           * This is a barrier to prevent the objset from going away in
 688  712           * dnode_move() until we can safely ensure that the objset is still in
 689  713           * use. We consider the objset valid before the barrier and invalid
 690  714           * after the barrier.
 691  715           */
 692  716          rw_enter(&os_lock, RW_READER);
 693  717          rw_exit(&os_lock);
 694  718  
 695  719          mutex_destroy(&os->os_lock);
 696  720          mutex_destroy(&os->os_obj_lock);
 697  721          mutex_destroy(&os->os_user_ptr_lock);
      722 +        spa_evicting_os_deregister(os->os_spa, os);
 698  723          kmem_free(os, sizeof (objset_t));
 699  724  }
 700  725  
 701  726  timestruc_t
 702  727  dmu_objset_snap_cmtime(objset_t *os)
 703  728  {
 704  729          return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
 705  730  }
 706  731  
 707  732  /* called from dsl for meta-objset */
↓ open down ↓ 188 lines elided ↑ open up ↑
 896  921          if (error != 0)
 897  922                  return (error);
 898  923  
 899  924          /* You can't clone across pools. */
 900  925          if (origin->ds_dir->dd_pool != dp) {
 901  926                  dsl_dataset_rele(origin, FTAG);
 902  927                  return (SET_ERROR(EXDEV));
 903  928          }
 904  929  
 905  930          /* You can only clone snapshots, not the head datasets. */
 906      -        if (!dsl_dataset_is_snapshot(origin)) {
      931 +        if (!origin->ds_is_snapshot) {
 907  932                  dsl_dataset_rele(origin, FTAG);
 908  933                  return (SET_ERROR(EINVAL));
 909  934          }
 910  935          dsl_dataset_rele(origin, FTAG);
 911  936  
 912  937          return (0);
 913  938  }
 914  939  
 915  940  static void
 916  941  dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
↓ open down ↓ 543 lines elided ↑ open up ↑
1460 1485          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
1461 1486              os->os_phys->os_type);
1462 1487          dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
1463 1488              dmu_objset_userspace_present(os));
1464 1489  }
1465 1490  
1466 1491  int
1467 1492  dmu_objset_is_snapshot(objset_t *os)
1468 1493  {
1469 1494          if (os->os_dsl_dataset != NULL)
1470      -                return (dsl_dataset_is_snapshot(os->os_dsl_dataset));
     1495 +                return (os->os_dsl_dataset->ds_is_snapshot);
1471 1496          else
1472 1497                  return (B_FALSE);
1473 1498  }
1474 1499  
1475 1500  int
1476 1501  dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
1477 1502      boolean_t *conflict)
1478 1503  {
1479 1504          dsl_dataset_t *ds = os->os_dsl_dataset;
1480 1505          uint64_t ignored;
↓ open down ↓ 349 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX