Print this page
5056 ZFS deadlock on db_mtx and dn_holds
Reviewed by: Will Andrews <willa@spectralogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george.wilson@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>


   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 Martin Matuska. All rights reserved.
  25  * Copyright (c) 2014 Joyent, Inc. All rights reserved.

  26  */
  27 
  28 #include <sys/dmu.h>
  29 #include <sys/dmu_objset.h>
  30 #include <sys/dmu_tx.h>
  31 #include <sys/dsl_dataset.h>
  32 #include <sys/dsl_dir.h>
  33 #include <sys/dsl_prop.h>
  34 #include <sys/dsl_synctask.h>
  35 #include <sys/dsl_deleg.h>
  36 #include <sys/dmu_impl.h>
  37 #include <sys/spa.h>
  38 #include <sys/metaslab.h>
  39 #include <sys/zap.h>
  40 #include <sys/zio.h>
  41 #include <sys/arc.h>
  42 #include <sys/sunddi.h>
  43 #include <sys/zfeature.h>
  44 #include <sys/policy.h>
  45 #include <sys/zfs_znode.h>


 108  * For consistency, the filesystem limit is also not enforced if the user can
 109  * modify the limit.
 110  *
 111  * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
 112  * and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
 113  * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
 114  * dsl_dir_init_fs_ss_count().
 115  *
 116  * There is a special case when we receive a filesystem that already exists. In
 117  * this case a temporary clone name of %X is created (see dmu_recv_begin). We
 118  * never update the filesystem counts for temporary clones.
 119  *
 120  * Likewise, we do not update the snapshot counts for temporary snapshots,
 121  * such as those created by zfs diff.
 122  */
 123 
 124 extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
 125 
 126 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
 127 
 128 /* ARGSUSED */
 129 static void
 130 dsl_dir_evict(dmu_buf_t *db, void *arg)
 131 {
             /*
              * Tear down the in-core dsl_dir_t passed in arg: release the
              * references that are tagged with the dsl_dir_t itself, destroy
              * its list and lock, and free it.  db is not used.
              */
 132         dsl_dir_t *dd = arg;
 133         dsl_pool_t *dp = dd->dd_pool;
 134         int t;
 135 


             /*
              * For every txg slot the dir must not be on the pool's
              * dirty-dirs list and must have no temp reservation or
              * space-to-write accounted against it.
              */
 136         for (t = 0; t < TXG_SIZE; t++) {
 137                 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
 138                 ASSERT(dd->dd_tempreserved[t] == 0);
 139                 ASSERT(dd->dd_space_towrite[t] == 0);
 140         }
 141 
             /* Drop the hold on the parent that uses this dd as its tag. */
 142         if (dd->dd_parent)
 143                 dsl_dir_rele(dd->dd_parent, dd);
 144 
             /*
              * Drop the spa reference tagged with dd; dsl_dir_hold_obj()
              * takes it with spa_open_ref(dp->dp_spa, dd) when it installs
              * a newly built dsl_dir_t.
              */
 145         spa_close(dd->dd_pool->dp_spa, dd);
 146 
 147         /*
 148          * The props callback list should have been cleaned up by
 149          * objset_evict().
 150          */
 151         list_destroy(&dd->dd_prop_cbs);
 152         mutex_destroy(&dd->dd_lock);
 153         kmem_free(dd, sizeof (dsl_dir_t));
 154 }
 155 
 156 int
 157 dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
 158     const char *tail, void *tag, dsl_dir_t **ddp)
 159 {
 160         dmu_buf_t *dbuf;
 161         dsl_dir_t *dd;
 162         int err;
 163 
 164         ASSERT(dsl_pool_config_held(dp));
 165 


 221                 if (dsl_dir_is_clone(dd)) {
 222                         dmu_buf_t *origin_bonus;
 223                         dsl_dataset_phys_t *origin_phys;
 224 
 225                         /*
 226                          * We can't open the origin dataset, because
 227                          * that would require opening this dsl_dir.
 228                          * Just look at its phys directly instead.
 229                          */
 230                         err = dmu_bonus_hold(dp->dp_meta_objset,
 231                             dsl_dir_phys(dd)->dd_origin_obj, FTAG,
 232                             &origin_bonus);
 233                         if (err != 0)
 234                                 goto errout;
 235                         origin_phys = origin_bonus->db_data;
 236                         dd->dd_origin_txg =
 237                             origin_phys->ds_creation_txg;
 238                         dmu_buf_rele(origin_bonus, FTAG);
 239                 }
 240 
 241                 winner = dmu_buf_set_user_ie(dbuf, dd, dsl_dir_evict);
 242                 if (winner) {

 243                         if (dd->dd_parent)
 244                                 dsl_dir_rele(dd->dd_parent, dd);
 245                         mutex_destroy(&dd->dd_lock);
 246                         kmem_free(dd, sizeof (dsl_dir_t));
 247                         dd = winner;
 248                 } else {
 249                         spa_open_ref(dp->dp_spa, dd);
 250                 }
 251         }
 252 
 253         /*
 254          * The dsl_dir_t has both open-to-close and instantiate-to-evict
 255          * holds on the spa.  We need the open-to-close holds because
 256          * otherwise the spa_refcnt wouldn't change when we open a
 257          * dir which the spa also has open, so we could incorrectly
 258          * think it was OK to unload/export/destroy the pool.  We need
 259          * the instantiate-to-evict hold because the dsl_dir_t has a
 260          * pointer to the dd_pool, which has a pointer to the spa_t.
 261          */
 262         spa_open_ref(dp->dp_spa, tag);


 266         *ddp = dd;
 267         return (0);
 268 
 269 errout:
 270         if (dd->dd_parent)
 271                 dsl_dir_rele(dd->dd_parent, dd);
 272         mutex_destroy(&dd->dd_lock);
 273         kmem_free(dd, sizeof (dsl_dir_t));
 274         dmu_buf_rele(dbuf, tag);
 275         return (err);
 276 }
 277 
 278 void
 279 dsl_dir_rele(dsl_dir_t *dd, void *tag)
 280 {
             /*
              * Release a hold on dd identified by tag: one spa reference
              * and one hold on the dir's dbuf, both keyed by the same tag.
              */

             /* Trace only; the format consumes just an empty string. */
 281         dprintf_dd(dd, "%s\n", "");
 282         spa_close(dd->dd_pool->dp_spa, tag);
 283         dmu_buf_rele(dd->dd_dbuf, tag);
 284 }
 285 















 286 /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
 287 void
 288 dsl_dir_name(dsl_dir_t *dd, char *buf)
 289 {
 290         if (dd->dd_parent) {
 291                 dsl_dir_name(dd->dd_parent, buf);
 292                 (void) strcat(buf, "/");
 293         } else {
 294                 buf[0] = '\0';
 295         }
 296         if (!MUTEX_HELD(&dd->dd_lock)) {
 297                 /*
 298                  * recursive mutex so that we can use
 299                  * dprintf_dd() with dd_lock held
 300                  */
 301                 mutex_enter(&dd->dd_lock);
 302                 (void) strcat(buf, dd->dd_myname);
 303                 mutex_exit(&dd->dd_lock);
 304         } else {
 305                 (void) strcat(buf, dd->dd_myname);


 396         dsl_dir_t *dd;
 397         uint64_t ddobj;
 398 
 399         err = getcomponent(name, buf, &next);
 400         if (err != 0)
 401                 return (err);
 402 
 403         /* Make sure the name is in the specified pool. */
 404         spaname = spa_name(dp->dp_spa);
 405         if (strcmp(buf, spaname) != 0)
 406                 return (SET_ERROR(EINVAL));
 407 
 408         ASSERT(dsl_pool_config_held(dp));
 409 
 410         err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
 411         if (err != 0) {
 412                 return (err);
 413         }
 414 
 415         while (next != NULL) {
 416                 dsl_dir_t *child_ds;
 417                 err = getcomponent(next, buf, &nextnext);
 418                 if (err != 0)
 419                         break;
 420                 ASSERT(next[0] != '\0');
 421                 if (next[0] == '@')
 422                         break;
 423                 dprintf("looking up %s in obj%lld\n",
 424                     buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
 425 
 426                 err = zap_lookup(dp->dp_meta_objset,
 427                     dsl_dir_phys(dd)->dd_child_dir_zapobj,
 428                     buf, sizeof (ddobj), 1, &ddobj);
 429                 if (err != 0) {
 430                         if (err == ENOENT)
 431                                 err = 0;
 432                         break;
 433                 }
 434 
 435                 err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds);
 436                 if (err != 0)
 437                         break;
 438                 dsl_dir_rele(dd, tag);
 439                 dd = child_ds;
 440                 next = nextnext;
 441         }
 442 
 443         if (err != 0) {
 444                 dsl_dir_rele(dd, tag);
 445                 return (err);
 446         }
 447 
 448         /*
 449          * It's an error if there's more than one component left, or
 450          * tailp==NULL and there's any component left.
 451          */
 452         if (next != NULL &&
 453             (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
 454                 /* bad path name */
 455                 dsl_dir_rele(dd, tag);
 456                 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
 457                 err = SET_ERROR(ENOENT);
 458         }
 459         if (tailp != NULL)




   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  24  * Copyright (c) 2013 Martin Matuska. All rights reserved.
  25  * Copyright (c) 2014 Joyent, Inc. All rights reserved.
  26  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  27  */
  28 
  29 #include <sys/dmu.h>
  30 #include <sys/dmu_objset.h>
  31 #include <sys/dmu_tx.h>
  32 #include <sys/dsl_dataset.h>
  33 #include <sys/dsl_dir.h>
  34 #include <sys/dsl_prop.h>
  35 #include <sys/dsl_synctask.h>
  36 #include <sys/dsl_deleg.h>
  37 #include <sys/dmu_impl.h>
  38 #include <sys/spa.h>
  39 #include <sys/metaslab.h>
  40 #include <sys/zap.h>
  41 #include <sys/zio.h>
  42 #include <sys/arc.h>
  43 #include <sys/sunddi.h>
  44 #include <sys/zfeature.h>
  45 #include <sys/policy.h>
  46 #include <sys/zfs_znode.h>


 109  * For consistency, the filesystem limit is also not enforced if the user can
 110  * modify the limit.
 111  *
 112  * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
 113  * and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
 114  * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
 115  * dsl_dir_init_fs_ss_count().
 116  *
 117  * There is a special case when we receive a filesystem that already exists. In
 118  * this case a temporary clone name of %X is created (see dmu_recv_begin). We
 119  * never update the filesystem counts for temporary clones.
 120  *
 121  * Likewise, we do not update the snapshot counts for temporary snapshots,
 122  * such as those created by zfs diff.
 123  */
 124 
 125 extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd);
 126 
 127 static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd);
 128 

 129 static void
 130 dsl_dir_evict(void *dbu)
 131 {
             /*
              * Tear down the in-core dsl_dir_t registered as the dbuf user:
              * release the references tagged with the dsl_dir_t itself,
              * destroy its list and lock, and free it.
              *
              * NOTE(review): dsl_dir_hold_obj() registers &dd->dd_dbu, so
              * assigning dbu straight to a dsl_dir_t * presumably relies on
              * dd_dbu being the first member of dsl_dir_t -- confirm.
              */
 132         dsl_dir_t *dd = dbu;
 133         dsl_pool_t *dp = dd->dd_pool;
 134         int t;
 135 
             /* Clear the dbuf pointer before the rest of the teardown. */
 136         dd->dd_dbuf = NULL;
 137 
             /*
              * For every txg slot the dir must not be on the pool's
              * dirty-dirs list and must have no temp reservation or
              * space-to-write accounted against it.
              */
 138         for (t = 0; t < TXG_SIZE; t++) {
 139                 ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t));
 140                 ASSERT(dd->dd_tempreserved[t] == 0);
 141                 ASSERT(dd->dd_space_towrite[t] == 0);
 142         }
 143 
             /*
              * Drop the hold on the parent that uses this dd as its tag,
              * going through the async release variant rather than
              * dsl_dir_rele().
              */
 144         if (dd->dd_parent)
 145                 dsl_dir_async_rele(dd->dd_parent, dd);
 146 
             /*
              * Drop the spa reference tagged with dd (taken in
              * dsl_dir_hold_obj() via spa_open_ref(dp->dp_spa, dd)), again
              * using the async close variant.
              */
 147         spa_async_close(dd->dd_pool->dp_spa, dd);
 148 
 149         /*
 150          * The props callback list should have been cleaned up by
 151          * objset_evict().
 152          */
 153         list_destroy(&dd->dd_prop_cbs);
 154         mutex_destroy(&dd->dd_lock);
 155         kmem_free(dd, sizeof (dsl_dir_t));
 156 }
 157 
 158 int
 159 dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
 160     const char *tail, void *tag, dsl_dir_t **ddp)
 161 {
 162         dmu_buf_t *dbuf;
 163         dsl_dir_t *dd;
 164         int err;
 165 
 166         ASSERT(dsl_pool_config_held(dp));
 167 


 223                 if (dsl_dir_is_clone(dd)) {
 224                         dmu_buf_t *origin_bonus;
 225                         dsl_dataset_phys_t *origin_phys;
 226 
 227                         /*
 228                          * We can't open the origin dataset, because
 229                          * that would require opening this dsl_dir.
 230                          * Just look at its phys directly instead.
 231                          */
 232                         err = dmu_bonus_hold(dp->dp_meta_objset,
 233                             dsl_dir_phys(dd)->dd_origin_obj, FTAG,
 234                             &origin_bonus);
 235                         if (err != 0)
 236                                 goto errout;
 237                         origin_phys = origin_bonus->db_data;
 238                         dd->dd_origin_txg =
 239                             origin_phys->ds_creation_txg;
 240                         dmu_buf_rele(origin_bonus, FTAG);
 241                 }
 242 
 243                 dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf);
 244                 winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu);
 245                 if (winner != NULL) {
 246                         if (dd->dd_parent)
 247                                 dsl_dir_rele(dd->dd_parent, dd);
 248                         mutex_destroy(&dd->dd_lock);
 249                         kmem_free(dd, sizeof (dsl_dir_t));
 250                         dd = winner;
 251                 } else {
 252                         spa_open_ref(dp->dp_spa, dd);
 253                 }
 254         }
 255 
 256         /*
 257          * The dsl_dir_t has both open-to-close and instantiate-to-evict
 258          * holds on the spa.  We need the open-to-close holds because
 259          * otherwise the spa_refcnt wouldn't change when we open a
 260          * dir which the spa also has open, so we could incorrectly
 261          * think it was OK to unload/export/destroy the pool.  We need
 262          * the instantiate-to-evict hold because the dsl_dir_t has a
 263          * pointer to the dd_pool, which has a pointer to the spa_t.
 264          */
 265         spa_open_ref(dp->dp_spa, tag);


 269         *ddp = dd;
 270         return (0);
 271 
 272 errout:
 273         if (dd->dd_parent)
 274                 dsl_dir_rele(dd->dd_parent, dd);
 275         mutex_destroy(&dd->dd_lock);
 276         kmem_free(dd, sizeof (dsl_dir_t));
 277         dmu_buf_rele(dbuf, tag);
 278         return (err);
 279 }
 280 
 281 void
 282 dsl_dir_rele(dsl_dir_t *dd, void *tag)
 283 {
             /*
              * Release a hold on dd identified by tag: one spa reference
              * and one hold on the dir's dbuf, both keyed by the same tag.
              * dsl_dir_async_rele() is the same except that it closes the
              * spa reference with spa_async_close().
              */

             /* Trace only; the format consumes just an empty string. */
 284         dprintf_dd(dd, "%s\n", "");
 285         spa_close(dd->dd_pool->dp_spa, tag);
 286         dmu_buf_rele(dd->dd_dbuf, tag);
 287 }
 288 
 289 /*
 290  * Remove a reference to the given dsl dir that is being asynchronously
 291  * released.  Async releases occur from a taskq performing eviction of
 292  * dsl datasets and dirs.  This process is identical to a normal release
 293  * with the exception of using the async API for releasing the reference on
 294  * the spa.
 295  */
 296 void
 297 dsl_dir_async_rele(dsl_dir_t *dd, void *tag)
 298 {
             /* Trace only; the format consumes just an empty string. */
 299         dprintf_dd(dd, "%s\n", "");
             /* The one difference from dsl_dir_rele(): async spa close. */
 300         spa_async_close(dd->dd_pool->dp_spa, tag);
             /* The dbuf hold for tag is dropped the same way as usual. */
 301         dmu_buf_rele(dd->dd_dbuf, tag);
 302 }
 303 
 304 /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */
 305 void
 306 dsl_dir_name(dsl_dir_t *dd, char *buf)
 307 {
 308         if (dd->dd_parent) {
 309                 dsl_dir_name(dd->dd_parent, buf);
 310                 (void) strcat(buf, "/");
 311         } else {
 312                 buf[0] = '\0';
 313         }
 314         if (!MUTEX_HELD(&dd->dd_lock)) {
 315                 /*
 316                  * recursive mutex so that we can use
 317                  * dprintf_dd() with dd_lock held
 318                  */
 319                 mutex_enter(&dd->dd_lock);
 320                 (void) strcat(buf, dd->dd_myname);
 321                 mutex_exit(&dd->dd_lock);
 322         } else {
 323                 (void) strcat(buf, dd->dd_myname);


 414         dsl_dir_t *dd;
 415         uint64_t ddobj;
 416 
 417         err = getcomponent(name, buf, &next);
 418         if (err != 0)
 419                 return (err);
 420 
 421         /* Make sure the name is in the specified pool. */
 422         spaname = spa_name(dp->dp_spa);
 423         if (strcmp(buf, spaname) != 0)
 424                 return (SET_ERROR(EINVAL));
 425 
 426         ASSERT(dsl_pool_config_held(dp));
 427 
 428         err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd);
 429         if (err != 0) {
 430                 return (err);
 431         }
 432 
 433         while (next != NULL) {
 434                 dsl_dir_t *child_dd;
 435                 err = getcomponent(next, buf, &nextnext);
 436                 if (err != 0)
 437                         break;
 438                 ASSERT(next[0] != '\0');
 439                 if (next[0] == '@')
 440                         break;
 441                 dprintf("looking up %s in obj%lld\n",
 442                     buf, dsl_dir_phys(dd)->dd_child_dir_zapobj);
 443 
 444                 err = zap_lookup(dp->dp_meta_objset,
 445                     dsl_dir_phys(dd)->dd_child_dir_zapobj,
 446                     buf, sizeof (ddobj), 1, &ddobj);
 447                 if (err != 0) {
 448                         if (err == ENOENT)
 449                                 err = 0;
 450                         break;
 451                 }
 452 
 453                 err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd);
 454                 if (err != 0)
 455                         break;
 456                 dsl_dir_rele(dd, tag);
 457                 dd = child_dd;
 458                 next = nextnext;
 459         }
 460 
 461         if (err != 0) {
 462                 dsl_dir_rele(dd, tag);
 463                 return (err);
 464         }
 465 
 466         /*
 467          * It's an error if there's more than one component left, or
 468          * tailp==NULL and there's any component left.
 469          */
 470         if (next != NULL &&
 471             (tailp == NULL || (nextnext && nextnext[0] != '\0'))) {
 472                 /* bad path name */
 473                 dsl_dir_rele(dd, tag);
 474                 dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp);
 475                 err = SET_ERROR(ENOENT);
 476         }
 477         if (tailp != NULL)