Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>

Split Close
Expand all
Collapse all
          --- old/usr/src/uts/common/fs/zfs/dnode.c
          +++ new/usr/src/uts/common/fs/zfs/dnode.c
↓ open down ↓ 13 lines elided ↑ open up ↑
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
       24 + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  24   25   */
  25   26  
  26   27  #include <sys/zfs_context.h>
  27   28  #include <sys/dbuf.h>
  28   29  #include <sys/dnode.h>
  29   30  #include <sys/dmu.h>
  30   31  #include <sys/dmu_impl.h>
  31   32  #include <sys/dmu_tx.h>
  32   33  #include <sys/dmu_objset.h>
  33   34  #include <sys/dsl_dir.h>
↓ open down ↓ 361 lines elided ↑ open up ↑
 395  396              1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
 396  397          dn->dn_datablksz = size;
 397  398          dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
 398  399          dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
 399  400  }
 400  401  
 401  402  static dnode_t *
 402  403  dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
 403  404      uint64_t object, dnode_handle_t *dnh)
 404  405  {
 405      -        dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
      406 +        dnode_t *dn;
 406  407  
      408 +        dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
 407  409          ASSERT(!POINTER_IS_VALID(dn->dn_objset));
 408  410          dn->dn_moved = 0;
 409  411  
 410  412          /*
 411  413           * Defer setting dn_objset until the dnode is ready to be a candidate
 412  414           * for the dnode_move() callback.
 413  415           */
 414  416          dn->dn_object = object;
 415  417          dn->dn_dbuf = db;
 416  418          dn->dn_handle = dnh;
↓ open down ↓ 16 lines elided ↑ open up ↑
 433  435          dn->dn_bonuslen = dnp->dn_bonuslen;
 434  436          dn->dn_maxblkid = dnp->dn_maxblkid;
 435  437          dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0);
 436  438          dn->dn_id_flags = 0;
 437  439  
 438  440          dmu_zfetch_init(&dn->dn_zfetch, dn);
 439  441  
 440  442          ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
 441  443  
 442  444          mutex_enter(&os->os_lock);
 443      -        list_insert_head(&os->os_dnodes, dn);
      445 +        if (dnh->dnh_dnode != NULL) {
      446 +                /* Lost the allocation race. */
      447 +                mutex_exit(&os->os_lock);
      448 +                kmem_cache_free(dnode_cache, dn);
      449 +                return (dnh->dnh_dnode);
      450 +        }
      451 +
      452 +        /*
      453 +         * Exclude special dnodes from os_dnodes so an empty os_dnodes
      454 +         * signifies that the special dnodes have no references from
      455 +         * their children (the entries in os_dnodes).  This allows
      456 +         * dnode_destroy() to easily determine if the last child has
      457 +         * been removed and then complete eviction of the objset.
      458 +         */
      459 +        if (!DMU_OBJECT_IS_SPECIAL(object))
      460 +                list_insert_head(&os->os_dnodes, dn);
 444  461          membar_producer();
      462 +
 445  463          /*
 446      -         * Everything else must be valid before assigning dn_objset makes the
 447      -         * dnode eligible for dnode_move().
      464 +         * Everything else must be valid before assigning dn_objset
      465 +         * makes the dnode eligible for dnode_move().
 448  466           */
 449  467          dn->dn_objset = os;
      468 +
      469 +        dnh->dnh_dnode = dn;
 450  470          mutex_exit(&os->os_lock);
 451  471  
 452  472          arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER);
 453  473          return (dn);
 454  474  }
 455  475  
 456  476  /*
 457  477   * Caller must be holding the dnode handle, which is released upon return.
 458  478   */
 459  479  static void
 460  480  dnode_destroy(dnode_t *dn)
 461  481  {
 462  482          objset_t *os = dn->dn_objset;
      483 +        boolean_t complete_os_eviction = B_FALSE;
 463  484  
 464  485          ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0);
 465  486  
 466  487          mutex_enter(&os->os_lock);
 467  488          POINTER_INVALIDATE(&dn->dn_objset);
 468      -        list_remove(&os->os_dnodes, dn);
      489 +        if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) {
      490 +                list_remove(&os->os_dnodes, dn);
      491 +                complete_os_eviction =
      492 +                    list_is_empty(&os->os_dnodes) &&
      493 +                    list_link_active(&os->os_evicting_node);
      494 +        }
 469  495          mutex_exit(&os->os_lock);
 470  496  
 471  497          /* the dnode can no longer move, so we can release the handle */
 472  498          zrl_remove(&dn->dn_handle->dnh_zrlock);
 473  499  
 474  500          dn->dn_allocated_txg = 0;
 475  501          dn->dn_free_txg = 0;
 476  502          dn->dn_assigned_txg = 0;
 477  503  
 478  504          dn->dn_dirtyctx = 0;
↓ open down ↓ 14 lines elided ↑ open up ↑
 493  519          dn->dn_olduid = 0;
 494  520          dn->dn_oldgid = 0;
 495  521          dn->dn_newuid = 0;
 496  522          dn->dn_newgid = 0;
 497  523          dn->dn_id_flags = 0;
 498  524          dn->dn_unlisted_l0_blkid = 0;
 499  525  
 500  526          dmu_zfetch_rele(&dn->dn_zfetch);
 501  527          kmem_cache_free(dnode_cache, dn);
 502  528          arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER);
      529 +
      530 +        if (complete_os_eviction)
      531 +                dmu_objset_evict_done(os);
 503  532  }
 504  533  
 505  534  void
 506  535  dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
 507  536      dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
 508  537  {
 509  538          int i;
 510  539  
 511  540          ASSERT3U(blocksize, <=,
 512  541              spa_maxblocksize(dmu_objset_spa(dn->dn_objset)));
↓ open down ↓ 446 lines elided ↑ open up ↑
 959  988          dnode_t *dn = dnh->dnh_dnode;
 960  989  
 961  990          /*
 962  991           * Wait for final references to the dnode to clear.  This can
 963  992           * only happen if the arc is asynchronously evicting state that
 964  993           * has a hold on this dnode while we are trying to evict this
 965  994           * dnode.
 966  995           */
 967  996          while (refcount_count(&dn->dn_holds) > 0)
 968  997                  delay(1);
      998 +        ASSERT(dn->dn_dbuf == NULL ||
      999 +            dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
 969 1000          zrl_add(&dnh->dnh_zrlock);
 970 1001          dnode_destroy(dn); /* implicit zrl_remove() */
 971 1002          zrl_destroy(&dnh->dnh_zrlock);
 972 1003          dnh->dnh_dnode = NULL;
 973 1004  }
 974 1005  
 975      -dnode_t *
     1006 +void
 976 1007  dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
 977 1008      dnode_handle_t *dnh)
 978 1009  {
 979      -        dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh);
 980      -        dnh->dnh_dnode = dn;
     1010 +        dnode_t *dn;
     1011 +
     1012 +        dn = dnode_create(os, dnp, NULL, object, dnh);
 981 1013          zrl_init(&dnh->dnh_zrlock);
 982 1014          DNODE_VERIFY(dn);
 983      -        return (dn);
 984 1015  }
 985 1016  
 986 1017  static void
 987      -dnode_buf_pageout(dmu_buf_t *db, void *arg)
     1018 +dnode_buf_pageout(void *dbu)
 988 1019  {
 989      -        dnode_children_t *children_dnodes = arg;
     1020 +        dnode_children_t *children_dnodes = dbu;
 990 1021          int i;
 991      -        int epb = db->db_size >> DNODE_SHIFT;
 992 1022  
 993      -        ASSERT(epb == children_dnodes->dnc_count);
 994      -
 995      -        for (i = 0; i < epb; i++) {
     1023 +        for (i = 0; i < children_dnodes->dnc_count; i++) {
 996 1024                  dnode_handle_t *dnh = &children_dnodes->dnc_children[i];
 997 1025                  dnode_t *dn;
 998 1026  
 999 1027                  /*
1000 1028                   * The dnode handle lock guards against the dnode moving to
1001 1029                   * another valid address, so there is no need here to guard
1002 1030                   * against changes to or from NULL.
1003 1031                   */
1004 1032                  if (dnh->dnh_dnode == NULL) {
1005 1033                          zrl_destroy(&dnh->dnh_zrlock);
↓ open down ↓ 9 lines elided ↑ open up ↑
1015 1043                   * would not have been called.
1016 1044                   */
1017 1045                  ASSERT(refcount_is_zero(&dn->dn_holds));
1018 1046                  ASSERT(refcount_is_zero(&dn->dn_tx_holds));
1019 1047  
1020 1048                  dnode_destroy(dn); /* implicit zrl_remove() */
1021 1049                  zrl_destroy(&dnh->dnh_zrlock);
1022 1050                  dnh->dnh_dnode = NULL;
1023 1051          }
1024 1052          kmem_free(children_dnodes, sizeof (dnode_children_t) +
1025      -            epb * sizeof (dnode_handle_t));
     1053 +            children_dnodes->dnc_count * sizeof (dnode_handle_t));
1026 1054  }
1027 1055  
1028 1056  /*
1029 1057   * errors:
1030 1058   * EINVAL - invalid object number.
1031 1059   * EIO - i/o error.
1032 1060   * succeeds even for free dnodes.
1033 1061   */
1034 1062  int
1035 1063  dnode_hold_impl(objset_t *os, uint64_t object, int flag,
↓ open down ↓ 63 lines elided ↑ open up ↑
1099 1127          ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
1100 1128          epb = db->db.db_size >> DNODE_SHIFT;
1101 1129  
1102 1130          idx = object & (epb-1);
1103 1131  
1104 1132          ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
1105 1133          children_dnodes = dmu_buf_get_user(&db->db);
1106 1134          if (children_dnodes == NULL) {
1107 1135                  int i;
1108 1136                  dnode_children_t *winner;
1109      -                children_dnodes = kmem_alloc(sizeof (dnode_children_t) +
     1137 +                children_dnodes = kmem_zalloc(sizeof (dnode_children_t) +
1110 1138                      epb * sizeof (dnode_handle_t), KM_SLEEP);
1111 1139                  children_dnodes->dnc_count = epb;
1112 1140                  dnh = &children_dnodes->dnc_children[0];
1113 1141                  for (i = 0; i < epb; i++) {
1114 1142                          zrl_init(&dnh[i].dnh_zrlock);
1115      -                        dnh[i].dnh_dnode = NULL;
1116 1143                  }
1117      -                if (winner = dmu_buf_set_user(&db->db, children_dnodes,
1118      -                    dnode_buf_pageout)) {
     1144 +                dmu_buf_init_user(&children_dnodes->dnc_dbu,
     1145 +                    dnode_buf_pageout, NULL);
     1146 +                winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu);
     1147 +                if (winner != NULL) {
1119 1148  
1120 1149                          for (i = 0; i < epb; i++) {
1121 1150                                  zrl_destroy(&dnh[i].dnh_zrlock);
1122 1151                          }
1123 1152  
1124 1153                          kmem_free(children_dnodes, sizeof (dnode_children_t) +
1125 1154                              epb * sizeof (dnode_handle_t));
1126 1155                          children_dnodes = winner;
1127 1156                  }
1128 1157          }
1129 1158          ASSERT(children_dnodes->dnc_count == epb);
1130 1159  
1131 1160          dnh = &children_dnodes->dnc_children[idx];
1132 1161          zrl_add(&dnh->dnh_zrlock);
1133      -        if ((dn = dnh->dnh_dnode) == NULL) {
     1162 +        dn = dnh->dnh_dnode;
     1163 +        if (dn == NULL) {
1134 1164                  dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx;
1135      -                dnode_t *winner;
1136 1165  
1137 1166                  dn = dnode_create(os, phys, db, object, dnh);
1138      -                winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn);
1139      -                if (winner != NULL) {
1140      -                        zrl_add(&dnh->dnh_zrlock);
1141      -                        dnode_destroy(dn); /* implicit zrl_remove() */
1142      -                        dn = winner;
1143      -                }
1144 1167          }
1145 1168  
1146 1169          mutex_enter(&dn->dn_mtx);
1147 1170          type = dn->dn_type;
1148 1171          if (dn->dn_free_txg ||
1149 1172              ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) ||
1150 1173              ((flag & DNODE_MUST_BE_FREE) &&
1151 1174              (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) {
1152 1175                  mutex_exit(&dn->dn_mtx);
1153 1176                  zrl_remove(&dnh->dnh_zrlock);
1154 1177                  dbuf_rele(db, FTAG);
1155 1178                  return (type == DMU_OT_NONE ? ENOENT : EEXIST);
1156 1179          }
1157      -        mutex_exit(&dn->dn_mtx);
1158      -
1159 1180          if (refcount_add(&dn->dn_holds, tag) == 1)
1160 1181                  dbuf_add_ref(db, dnh);
     1182 +        mutex_exit(&dn->dn_mtx);
     1183 +
1161 1184          /* Now we can rely on the hold to prevent the dnode from moving. */
1162 1185          zrl_remove(&dnh->dnh_zrlock);
1163 1186  
1164 1187          DNODE_VERIFY(dn);
1165 1188          ASSERT3P(dn->dn_dbuf, ==, db);
1166 1189          ASSERT3U(dn->dn_object, ==, object);
1167 1190          dbuf_rele(db, FTAG);
1168 1191  
1169 1192          *dnp = dn;
1170 1193          return (0);
↓ open down ↓ 791 lines elided ↑ open up ↑
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX