1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2012 by Delphix. All rights reserved.
  24  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  25  */
  26 
  27 #include <sys/dsl_dataset.h>
  28 #include <sys/dmu.h>
  29 #include <sys/refcount.h>
  30 #include <sys/zap.h>
  31 #include <sys/zfs_context.h>
  32 #include <sys/dsl_pool.h>
  33 
  34 /*
  35  * Deadlist concurrency:
  36  *
  37  * Deadlists can only be modified from the syncing thread.
  38  *
  39  * Except for dsl_deadlist_insert(), it can only be modified with the
  40  * dp_config_rwlock held with RW_WRITER.
  41  *
  42  * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
  43  * be called concurrently, from open context, with the dl_config_rwlock held
  44  * with RW_READER.
  45  *
  46  * Therefore, we only need to provide locking between dsl_deadlist_insert() and
  47  * the accessors, protecting:
  48  *     dl_phys->dl_used,comp,uncomp
  49  *     and protecting the dl_tree from being loaded.
  50  * The locking is provided by dl_lock.  Note that locking on the bpobj_t
  51  * provides its own locking, and dl_oldfmt is immutable.
  52  */
  53 
  54 static int
  55 dsl_deadlist_compare(const void *arg1, const void *arg2)
  56 {
  57         const dsl_deadlist_entry_t *dle1 = arg1;
  58         const dsl_deadlist_entry_t *dle2 = arg2;
  59 
  60         if (dle1->dle_mintxg < dle2->dle_mintxg)
  61                 return (-1);
  62         else if (dle1->dle_mintxg > dle2->dle_mintxg)
  63                 return (+1);
  64         else
  65                 return (0);
  66 }
  67 
  68 static void
  69 dsl_deadlist_load_tree(dsl_deadlist_t *dl)
  70 {
  71         zap_cursor_t zc;
  72         zap_attribute_t za;
  73 
  74         ASSERT(!dl->dl_oldfmt);
  75         if (dl->dl_havetree)
  76                 return;
  77 
  78         avl_create(&dl->dl_tree, dsl_deadlist_compare,
  79             sizeof (dsl_deadlist_entry_t),
  80             offsetof(dsl_deadlist_entry_t, dle_node));
  81         for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object);
  82             zap_cursor_retrieve(&zc, &za) == 0;
  83             zap_cursor_advance(&zc)) {
  84                 dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
  85                 dle->dle_mintxg = strtonum(za.za_name, NULL);
  86                 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os,
  87                     za.za_first_integer));
  88                 avl_add(&dl->dl_tree, dle);
  89         }
  90         zap_cursor_fini(&zc);
  91         dl->dl_havetree = B_TRUE;
  92 }
  93 
  94 void
  95 dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object)
  96 {
  97         dmu_object_info_t doi;
  98 
  99         mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL);
 100         dl->dl_os = os;
 101         dl->dl_object = object;
 102         VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf));
 103         dmu_object_info_from_db(dl->dl_dbuf, &doi);
 104         if (doi.doi_type == DMU_OT_BPOBJ) {
 105                 dmu_buf_rele(dl->dl_dbuf, dl);
 106                 dl->dl_dbuf = NULL;
 107                 dl->dl_oldfmt = B_TRUE;
 108                 VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object));
 109                 return;
 110         }
 111 
 112         dl->dl_oldfmt = B_FALSE;
 113         dl->dl_phys = dl->dl_dbuf->db_data;
 114         dl->dl_havetree = B_FALSE;
 115 }
 116 
 117 void
 118 dsl_deadlist_close(dsl_deadlist_t *dl)
 119 {
 120         void *cookie = NULL;
 121         dsl_deadlist_entry_t *dle;
 122 
 123         dl->dl_os = NULL;
 124 
 125         if (dl->dl_oldfmt) {
 126                 dl->dl_oldfmt = B_FALSE;
 127                 bpobj_close(&dl->dl_bpobj);
 128                 return;
 129         }
 130 
 131         if (dl->dl_havetree) {
 132                 while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie))
 133                     != NULL) {
 134                         bpobj_close(&dle->dle_bpobj);
 135                         kmem_free(dle, sizeof (*dle));
 136                 }
 137                 avl_destroy(&dl->dl_tree);
 138         }
 139         dmu_buf_rele(dl->dl_dbuf, dl);
 140         mutex_destroy(&dl->dl_lock);
 141         dl->dl_dbuf = NULL;
 142         dl->dl_phys = NULL;
 143 }
 144 
 145 uint64_t
 146 dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx)
 147 {
 148         if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS)
 149                 return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx));
 150         return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR,
 151             sizeof (dsl_deadlist_phys_t), tx));
 152 }
 153 
 154 void
 155 dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx)
 156 {
 157         dmu_object_info_t doi;
 158         zap_cursor_t zc;
 159         zap_attribute_t za;
 160 
 161         VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi));
 162         if (doi.doi_type == DMU_OT_BPOBJ) {
 163                 bpobj_free(os, dlobj, tx);
 164                 return;
 165         }
 166 
 167         for (zap_cursor_init(&zc, os, dlobj);
 168             zap_cursor_retrieve(&zc, &za) == 0;
 169             zap_cursor_advance(&zc)) {
 170                 uint64_t obj = za.za_first_integer;
 171                 if (obj == dmu_objset_pool(os)->dp_empty_bpobj)
 172                         bpobj_decr_empty(os, tx);
 173                 else
 174                         bpobj_free(os, obj, tx);
 175         }
 176         zap_cursor_fini(&zc);
 177         VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx));
 178 }
 179 
 180 static void
 181 dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
 182     const blkptr_t *bp, dmu_tx_t *tx)
 183 {
 184         if (dle->dle_bpobj.bpo_object ==
 185             dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
 186                 uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 187                 bpobj_close(&dle->dle_bpobj);
 188                 bpobj_decr_empty(dl->dl_os, tx);
 189                 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 190                 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
 191                     dle->dle_mintxg, obj, tx));
 192         }
 193         bpobj_enqueue(&dle->dle_bpobj, bp, tx);
 194 }
 195 
 196 static void
 197 dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle,
 198     uint64_t obj, dmu_tx_t *tx)
 199 {
 200         if (dle->dle_bpobj.bpo_object !=
 201             dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) {
 202                 bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx);
 203         } else {
 204                 bpobj_close(&dle->dle_bpobj);
 205                 bpobj_decr_empty(dl->dl_os, tx);
 206                 VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 207                 VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object,
 208                     dle->dle_mintxg, obj, tx));
 209         }
 210 }
 211 
 212 void
 213 dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx)
 214 {
 215         dsl_deadlist_entry_t dle_tofind;
 216         dsl_deadlist_entry_t *dle;
 217         avl_index_t where;
 218 
 219         if (dl->dl_oldfmt) {
 220                 bpobj_enqueue(&dl->dl_bpobj, bp, tx);
 221                 return;
 222         }
 223 
 224         dsl_deadlist_load_tree(dl);
 225 
 226         dmu_buf_will_dirty(dl->dl_dbuf, tx);
 227         mutex_enter(&dl->dl_lock);
 228         dl->dl_phys->dl_used +=
 229             bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp);
 230         dl->dl_phys->dl_comp += BP_GET_PSIZE(bp);
 231         dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp);
 232         mutex_exit(&dl->dl_lock);
 233 
 234         dle_tofind.dle_mintxg = bp->blk_birth;
 235         dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 236         if (dle == NULL)
 237                 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 238         else
 239                 dle = AVL_PREV(&dl->dl_tree, dle);
 240         dle_enqueue(dl, dle, bp, tx);
 241 }
 242 
 243 /*
 244  * Insert new key in deadlist, which must be > all current entries.
 245  * mintxg is not inclusive.
 246  */
 247 void
 248 dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 249 {
 250         uint64_t obj;
 251         dsl_deadlist_entry_t *dle;
 252 
 253         if (dl->dl_oldfmt)
 254                 return;
 255 
 256         dsl_deadlist_load_tree(dl);
 257 
 258         dle = kmem_alloc(sizeof (*dle), KM_SLEEP);
 259         dle->dle_mintxg = mintxg;
 260         obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 261         VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj));
 262         avl_add(&dl->dl_tree, dle);
 263 
 264         VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object,
 265             mintxg, obj, tx));
 266 }
 267 
 268 /*
 269  * Remove this key, merging its entries into the previous key.
 270  */
 271 void
 272 dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx)
 273 {
 274         dsl_deadlist_entry_t dle_tofind;
 275         dsl_deadlist_entry_t *dle, *dle_prev;
 276 
 277         if (dl->dl_oldfmt)
 278                 return;
 279 
 280         dsl_deadlist_load_tree(dl);
 281 
 282         dle_tofind.dle_mintxg = mintxg;
 283         dle = avl_find(&dl->dl_tree, &dle_tofind, NULL);
 284         dle_prev = AVL_PREV(&dl->dl_tree, dle);
 285 
 286         dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx);
 287 
 288         avl_remove(&dl->dl_tree, dle);
 289         bpobj_close(&dle->dle_bpobj);
 290         kmem_free(dle, sizeof (*dle));
 291 
 292         VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx));
 293 }
 294 
 295 /*
 296  * Walk ds's snapshots to regenerate generate ZAP & AVL.
 297  */
 298 static void
 299 dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
 300     uint64_t mrs_obj, dmu_tx_t *tx)
 301 {
 302         dsl_deadlist_t dl;
 303         dsl_pool_t *dp = dmu_objset_pool(os);
 304 
 305         dsl_deadlist_open(&dl, os, dlobj);
 306         if (dl.dl_oldfmt) {
 307                 dsl_deadlist_close(&dl);
 308                 return;
 309         }
 310 
 311         while (mrs_obj != 0) {
 312                 dsl_dataset_t *ds;
 313                 VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
 314                 dsl_deadlist_add_key(&dl,
 315                     dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
 316                 mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
 317                 dsl_dataset_rele(ds, FTAG);
 318         }
 319         dsl_deadlist_close(&dl);
 320 }
 321 
 322 uint64_t
 323 dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
 324     uint64_t mrs_obj, dmu_tx_t *tx)
 325 {
 326         dsl_deadlist_entry_t *dle;
 327         uint64_t newobj;
 328 
 329         newobj = dsl_deadlist_alloc(dl->dl_os, tx);
 330 
 331         if (dl->dl_oldfmt) {
 332                 dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
 333                 return (newobj);
 334         }
 335 
 336         dsl_deadlist_load_tree(dl);
 337 
 338         for (dle = avl_first(&dl->dl_tree); dle;
 339             dle = AVL_NEXT(&dl->dl_tree, dle)) {
 340                 uint64_t obj;
 341 
 342                 if (dle->dle_mintxg >= maxtxg)
 343                         break;
 344 
 345                 obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
 346                 VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj,
 347                     dle->dle_mintxg, obj, tx));
 348         }
 349         return (newobj);
 350 }
 351 
 352 void
 353 dsl_deadlist_space(dsl_deadlist_t *dl,
 354     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 355 {
 356         if (dl->dl_oldfmt) {
 357                 VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj,
 358                     usedp, compp, uncompp));
 359                 return;
 360         }
 361 
 362         mutex_enter(&dl->dl_lock);
 363         *usedp = dl->dl_phys->dl_used;
 364         *compp = dl->dl_phys->dl_comp;
 365         *uncompp = dl->dl_phys->dl_uncomp;
 366         mutex_exit(&dl->dl_lock);
 367 }
 368 
 369 /*
 370  * return space used in the range (mintxg, maxtxg].
 371  * Includes maxtxg, does not include mintxg.
 372  * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
 373  * larger than any bp in the deadlist (eg. UINT64_MAX)).
 374  */
 375 void
 376 dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
 377     uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
 378 {
 379         dsl_deadlist_entry_t *dle;
 380         dsl_deadlist_entry_t dle_tofind;
 381         avl_index_t where;
 382 
 383         if (dl->dl_oldfmt) {
 384                 VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj,
 385                     mintxg, maxtxg, usedp, compp, uncompp));
 386                 return;
 387         }
 388 
 389         *usedp = *compp = *uncompp = 0;
 390 
 391         mutex_enter(&dl->dl_lock);
 392         dsl_deadlist_load_tree(dl);
 393         dle_tofind.dle_mintxg = mintxg;
 394         dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 395         /*
 396          * If we don't find this mintxg, there shouldn't be anything
 397          * after it either.
 398          */
 399         ASSERT(dle != NULL ||
 400             avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL);
 401 
 402         for (; dle && dle->dle_mintxg < maxtxg;
 403             dle = AVL_NEXT(&dl->dl_tree, dle)) {
 404                 uint64_t used, comp, uncomp;
 405 
 406                 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
 407                     &used, &comp, &uncomp));
 408 
 409                 *usedp += used;
 410                 *compp += comp;
 411                 *uncompp += uncomp;
 412         }
 413         mutex_exit(&dl->dl_lock);
 414 }
 415 
 416 static void
 417 dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
 418     dmu_tx_t *tx)
 419 {
 420         dsl_deadlist_entry_t dle_tofind;
 421         dsl_deadlist_entry_t *dle;
 422         avl_index_t where;
 423         uint64_t used, comp, uncomp;
 424         bpobj_t bpo;
 425 
 426         VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
 427         VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp));
 428         bpobj_close(&bpo);
 429 
 430         dsl_deadlist_load_tree(dl);
 431 
 432         dmu_buf_will_dirty(dl->dl_dbuf, tx);
 433         mutex_enter(&dl->dl_lock);
 434         dl->dl_phys->dl_used += used;
 435         dl->dl_phys->dl_comp += comp;
 436         dl->dl_phys->dl_uncomp += uncomp;
 437         mutex_exit(&dl->dl_lock);
 438 
 439         dle_tofind.dle_mintxg = birth;
 440         dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 441         if (dle == NULL)
 442                 dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
 443         dle_enqueue_subobj(dl, dle, obj, tx);
 444 }
 445 
 446 static int
 447 dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 448 {
 449         dsl_deadlist_t *dl = arg;
 450         dsl_deadlist_insert(dl, bp, tx);
 451         return (0);
 452 }
 453 
 454 /*
 455  * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
 456  * an empty deadlist.
 457  */
 458 void
 459 dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
 460 {
 461         zap_cursor_t zc;
 462         zap_attribute_t za;
 463         dmu_buf_t *bonus;
 464         dsl_deadlist_phys_t *dlp;
 465         dmu_object_info_t doi;
 466 
 467         VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi));
 468         if (doi.doi_type == DMU_OT_BPOBJ) {
 469                 bpobj_t bpo;
 470                 VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj));
 471                 VERIFY3U(0, ==, bpobj_iterate(&bpo,
 472                     dsl_deadlist_insert_cb, dl, tx));
 473                 bpobj_close(&bpo);
 474                 return;
 475         }
 476 
 477         for (zap_cursor_init(&zc, dl->dl_os, obj);
 478             zap_cursor_retrieve(&zc, &za) == 0;
 479             zap_cursor_advance(&zc)) {
 480                 uint64_t mintxg = strtonum(za.za_name, NULL);
 481                 dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
 482                 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx));
 483         }
 484         zap_cursor_fini(&zc);
 485 
 486         VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
 487         dlp = bonus->db_data;
 488         dmu_buf_will_dirty(bonus, tx);
 489         bzero(dlp, sizeof (*dlp));
 490         dmu_buf_rele(bonus, FTAG);
 491 }
 492 
 493 /*
 494  * Remove entries on dl that are >= mintxg, and put them on the bpobj.
 495  */
 496 void
 497 dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
 498     dmu_tx_t *tx)
 499 {
 500         dsl_deadlist_entry_t dle_tofind;
 501         dsl_deadlist_entry_t *dle;
 502         avl_index_t where;
 503 
 504         ASSERT(!dl->dl_oldfmt);
 505         dmu_buf_will_dirty(dl->dl_dbuf, tx);
 506         dsl_deadlist_load_tree(dl);
 507 
 508         dle_tofind.dle_mintxg = mintxg;
 509         dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
 510         if (dle == NULL)
 511                 dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
 512         while (dle) {
 513                 uint64_t used, comp, uncomp;
 514                 dsl_deadlist_entry_t *dle_next;
 515 
 516                 bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
 517 
 518                 VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj,
 519                     &used, &comp, &uncomp));
 520                 mutex_enter(&dl->dl_lock);
 521                 ASSERT3U(dl->dl_phys->dl_used, >=, used);
 522                 ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
 523                 ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
 524                 dl->dl_phys->dl_used -= used;
 525                 dl->dl_phys->dl_comp -= comp;
 526                 dl->dl_phys->dl_uncomp -= uncomp;
 527                 mutex_exit(&dl->dl_lock);
 528 
 529                 VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object,
 530                     dle->dle_mintxg, tx));
 531 
 532                 dle_next = AVL_NEXT(&dl->dl_tree, dle);
 533                 avl_remove(&dl->dl_tree, dle);
 534                 bpobj_close(&dle->dle_bpobj);
 535                 kmem_free(dle, sizeof (*dle));
 536                 dle = dle_next;
 537         }
 538 }