/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_pool.h>
#include <sys/dnode.h>
#include <sys/spa.h>
#include <sys/zio.h>
#include <sys/dmu_impl.h>
#include <sys/sa.h>
#include <sys/sa_impl.h>
#include <sys/callb.h>

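/*
 * Maximum number of blocks the prefetch thread is allowed to have
 * fetched ahead of the main traversal (see traverse_prefetcher()).
 */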
int zfs_pd_blks_max = 100;

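/*
 * State shared between the main traversal thread and the prefetch thread.
 * pd_blks_fetched counts blocks that have been prefetched but not yet
 * consumed by the traversal; pd_mtx and pd_cv coordinate the two threads,
 * and pd_cancel/pd_exited handle shutdown.
 */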
typedef struct prefetch_data {
	kmutex_t pd_mtx;
	kcondvar_t pd_cv;
	int pd_blks_max;
	int pd_blks_fetched;
	int pd_flags;
	boolean_t pd_cancel;
	boolean_t pd_exited;
} prefetch_data_t;

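/*
 * Per-traversal state: the pool and root block pointer being walked, the
 * oldest birth txg of interest, the caller's callback and argument, and
 * (optionally) a pointer to the prefetch state above.
 */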
typedef struct traverse_data {
	spa_t *td_spa;
	uint64_t td_objset;
	blkptr_t *td_rootbp;
	uint64_t td_min_txg;
	int td_flags;
	prefetch_data_t *td_pfd;
	blkptr_cb_t *td_func;
	void *td_arg;
} traverse_data_t;

static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object);

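/*
 * zil_parse() callback for log blocks: hand each allocated ZIL block to
 * the traversal callback.  If the log has not been claimed (claim_txg == 0),
 * blocks born since this pool was opened are skipped because they are not
 * yet stable on disk.
 */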
static int
traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
{
	traverse_data_t *td = arg;
	zbookmark_t zb;

	if (bp->blk_birth == 0)
		return (0);

	if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa))
		return (0);

	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);

	(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL, td->td_arg);

	return (0);
}

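/*
 * zil_parse() callback for log records: for TX_WRITE records, visit the
 * block pointer embedded in the record, provided the log has been claimed
 * and the block was born at or after the claim txg.
 */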
static int
traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
{
	traverse_data_t *td = arg;

	if (lrc->lrc_txtype == TX_WRITE) {
		lr_write_t *lr = (lr_write_t *)lrc;
		blkptr_t *bp = &lr->lr_blkptr;
		zbookmark_t zb;

		if (bp->blk_birth == 0)
			return (0);

		if (claim_txg == 0 || bp->blk_birth < claim_txg)
			return (0);

		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));

		(void) td->td_func(td->td_spa, zilog, bp, NULL, &zb, NULL,
		    td->td_arg);
	}
	return (0);
}

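/*
 * Walk the intent log of the objset being traversed, invoking the
 * traversal callback on its blocks via the two helpers above.
 */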
static void
traverse_zil(traverse_data_t *td, zil_header_t *zh)
{
	uint64_t claim_txg = zh->zh_claim_txg;
	zilog_t *zilog;

	/*
	 * We only want to visit blocks that have been claimed but not yet
	 * replayed (or, in read-only mode, blocks that *would* be claimed).
	 */
	if (claim_txg == 0 && spa_writeable(td->td_spa))
		return;

	zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);

	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
	    claim_txg);

	zil_free(zilog);
}

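/*
 * Recursive heart of the traversal.  Visit the block described by bp,
 * invoking the callback in pre- and/or post-order as requested, and
 * recurse into its children: indirect blocks, the dnodes packed in a
 * dnode block, or the meta/user/group dnodes of an objset block.
 */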
static int
traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb)
{
	zbookmark_t czb;
	int err = 0, lasterr = 0;
	arc_buf_t *buf = NULL;
	prefetch_data_t *pd = td->td_pfd;
	boolean_t hard = td->td_flags & TRAVERSE_HARD;

	if (bp->blk_birth == 0) {
		err = td->td_func(td->td_spa, NULL, NULL, pbuf, zb, dnp,
		    td->td_arg);
		return (err);
	}

	if (bp->blk_birth <= td->td_min_txg)
		return (0);

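	/*
	 * If a prefetch thread is running and would have prefetched this
	 * block, wait until it has done so and consume one of its
	 * fetched-block credits, keeping the two threads in step.
	 */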
	if (pd && !pd->pd_exited &&
	    ((pd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) {
		mutex_enter(&pd->pd_mtx);
		ASSERT(pd->pd_blks_fetched >= 0);
		while (pd->pd_blks_fetched == 0 && !pd->pd_exited)
			cv_wait(&pd->pd_cv, &pd->pd_mtx);
		pd->pd_blks_fetched--;
		cv_broadcast(&pd->pd_cv);
		mutex_exit(&pd->pd_mtx);
	}

	if (td->td_flags & TRAVERSE_PRE) {
		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
		    td->td_arg);
		if (err == TRAVERSE_VISIT_NO_CHILDREN)
			return (0);
		if (err)
			return (err);
	}

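	/*
	 * Read this block and descend into its children according to its
	 * type: indirect block, dnode block, or objset block.  Leaf data
	 * blocks have no children and fall through.
	 */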
	if (BP_GET_LEVEL(bp) > 0) {
		uint32_t flags = ARC_WAIT;
		int i;
		blkptr_t *cbp;
		int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT;

		err = dsl_read(NULL, td->td_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);

		/* recursively visitbp() blocks below this */
		cbp = buf->b_data;
		for (i = 0; i < epb; i++, cbp++) {
			SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object,
			    zb->zb_level - 1,
			    zb->zb_blkid * epb + i);
			err = traverse_visitbp(td, dnp, buf, cbp, &czb);
			if (err) {
				if (!hard)
					break;
				lasterr = err;
			}
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) {
		uint32_t flags = ARC_WAIT;
		int i;
		int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT;

		err = dsl_read(NULL, td->td_spa, bp, pbuf,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);

		/* recursively visitbp() blocks below this */
		dnp = buf->b_data;
		for (i = 0; i < epb; i++, dnp++) {
			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
			    zb->zb_blkid * epb + i);
			if (err) {
				if (!hard)
					break;
				lasterr = err;
			}
		}
	} else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) {
		uint32_t flags = ARC_WAIT;
		objset_phys_t *osp;
		dnode_phys_t *dnp;

		err = dsl_read_nolock(NULL, td->td_spa, bp,
		    arc_getbuf_func, &buf,
		    ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb);
		if (err)
			return (err);

		osp = buf->b_data;
		dnp = &osp->os_meta_dnode;
		err = traverse_dnode(td, dnp, buf, zb->zb_objset,
		    DMU_META_DNODE_OBJECT);
		if (err && hard) {
			lasterr = err;
			err = 0;
		}
		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
			dnp = &osp->os_userused_dnode;
			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
			    DMU_USERUSED_OBJECT);
		}
		if (err && hard) {
			lasterr = err;
			err = 0;
		}
		if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) {
			dnp = &osp->os_groupused_dnode;
			err = traverse_dnode(td, dnp, buf, zb->zb_objset,
			    DMU_GROUPUSED_OBJECT);
		}
	}

	if (buf)
		(void) arc_buf_remove_ref(buf, &buf);

	if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) {
		err = td->td_func(td->td_spa, NULL, bp, pbuf, zb, dnp,
		    td->td_arg);
	}

	return (err != 0 ? err : lasterr);
}

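/*
 * Visit every block pointer in a dnode, including its spill block if one
 * is present, by handing each to traverse_visitbp().
 */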
static int
traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp,
    arc_buf_t *buf, uint64_t objset, uint64_t object)
{
	int j, err = 0, lasterr = 0;
	zbookmark_t czb;
	boolean_t hard = (td->td_flags & TRAVERSE_HARD);

	for (j = 0; j < dnp->dn_nblkptr; j++) {
		SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j);
		err = traverse_visitbp(td, dnp, buf,
		    (blkptr_t *)&dnp->dn_blkptr[j], &czb);
		if (err) {
			if (!hard)
				break;
			lasterr = err;
		}
	}

	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) {
		SET_BOOKMARK(&czb, objset,
		    object, 0, DMU_SPILL_BLKID);
		err = traverse_visitbp(td, dnp, buf,
		    (blkptr_t *)&dnp->dn_spill, &czb);
		if (err) {
			if (!hard)
				return (err);
			lasterr = err;
		}
	}
	return (err != 0 ? err : lasterr);
}

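/*
 * Callback used by the prefetch thread: issue a speculative, asynchronous
 * ARC read for each interesting block, but never run more than pd_blks_max
 * blocks ahead of the main traversal.
 */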
/* ARGSUSED */
static int
traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    arc_buf_t *pbuf, const zbookmark_t *zb, const dnode_phys_t *dnp,
    void *arg)
{
	prefetch_data_t *pfd = arg;
	uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;

	ASSERT(pfd->pd_blks_fetched >= 0);
	if (pfd->pd_cancel)
		return (EINTR);

	if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) ||
	    BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) ||
	    BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG)
		return (0);

	mutex_enter(&pfd->pd_mtx);
	while (!pfd->pd_cancel && pfd->pd_blks_fetched >= pfd->pd_blks_max)
		cv_wait(&pfd->pd_cv, &pfd->pd_mtx);
	pfd->pd_blks_fetched++;
	cv_broadcast(&pfd->pd_cv);
	mutex_exit(&pfd->pd_mtx);

	(void) dsl_read(NULL, spa, bp, pbuf, NULL, NULL,
	    ZIO_PRIORITY_ASYNC_READ,
	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
	    &aflags, zb);

	return (0);
}

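/*
 * Body of the prefetch taskq thread: repeat the traversal from the root
 * block pointer with traverse_prefetcher() as the callback, then mark the
 * prefetch state exited and wake the main thread.
 */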
static void
traverse_prefetch_thread(void *arg)
{
	traverse_data_t *td_main = arg;
	traverse_data_t td = *td_main;
	zbookmark_t czb;

	td.td_func = traverse_prefetcher;
	td.td_arg = td_main->td_pfd;
	td.td_pfd = NULL;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	(void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb);

	mutex_enter(&td_main->td_pfd->pd_mtx);
	td_main->td_pfd->pd_exited = B_TRUE;
	cv_broadcast(&td_main->td_pfd->pd_cv);
	mutex_exit(&td_main->td_pfd->pd_mtx);
}

/*
 * NB: dataset must not be changing on-disk (e.g., is a snapshot or we are
 * in syncing context).
 */
static int
traverse_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *rootbp,
    uint64_t txg_start, int flags, blkptr_cb_t func, void *arg)
{
	traverse_data_t td;
	prefetch_data_t pd = { 0 };
	zbookmark_t czb;
	int err;

	td.td_spa = spa;
	td.td_objset = ds ? ds->ds_object : 0;
	td.td_rootbp = rootbp;
	td.td_min_txg = txg_start;
	td.td_func = func;
	td.td_arg = arg;
	td.td_pfd = &pd;
	td.td_flags = flags;

	pd.pd_blks_max = zfs_pd_blks_max;
	pd.pd_flags = flags;
	mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL);

	/* See comment on ZIL traversal in dsl_scan_visitds. */
	if (ds != NULL && !dsl_dataset_is_snapshot(ds)) {
		objset_t *os;

		err = dmu_objset_from_ds(ds, &os);
		if (err)
			return (err);

		traverse_zil(&td, &os->os_zil_header);
	}

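	/*
	 * If prefetching was not requested, or the prefetch thread could
	 * not be dispatched, mark the prefetch state as already exited so
	 * the traversal never waits on it.
	 */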
	if (!(flags & TRAVERSE_PREFETCH) ||
	    0 == taskq_dispatch(system_taskq, traverse_prefetch_thread,
	    &td, TQ_NOQUEUE))
		pd.pd_exited = B_TRUE;

	SET_BOOKMARK(&czb, td.td_objset,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb);

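	/* Tell the prefetch thread to stop and wait for it to exit. */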
	mutex_enter(&pd.pd_mtx);
	pd.pd_cancel = B_TRUE;
	cv_broadcast(&pd.pd_cv);
	while (!pd.pd_exited)
		cv_wait(&pd.pd_cv, &pd.pd_mtx);
	mutex_exit(&pd.pd_mtx);

	mutex_destroy(&pd.pd_mtx);
	cv_destroy(&pd.pd_cv);

	return (err);
}

/*
 * NB: dataset must not be changing on-disk (e.g., is a snapshot or we are
 * in syncing context).
 */
int
traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds,
	    &ds->ds_phys->ds_bp, txg_start, flags, func, arg));
}

/*
 * NB: pool must not be changing on-disk (e.g., from zdb or sync context).
 */
int
traverse_pool(spa_t *spa, uint64_t txg_start, int flags,
    blkptr_cb_t func, void *arg)
{
	int err, lasterr = 0;
	uint64_t obj;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;
	boolean_t hard = (flags & TRAVERSE_HARD);

	/* visit the MOS */
	err = traverse_impl(spa, NULL, spa_get_rootblkptr(spa),
	    txg_start, flags, func, arg);
	if (err)
		return (err);

	/* visit each dataset */
	for (obj = 1; err == 0 || (err != ESRCH && hard);
	    err = dmu_object_next(mos, &obj, FALSE, txg_start)) {
		dmu_object_info_t doi;

		err = dmu_object_info(mos, obj, &doi);
		if (err) {
			if (!hard)
				return (err);
			lasterr = err;
			continue;
		}

		if (doi.doi_type == DMU_OT_DSL_DATASET) {
			dsl_dataset_t *ds;
			uint64_t txg = txg_start;

			rw_enter(&dp->dp_config_rwlock, RW_READER);
			err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
			rw_exit(&dp->dp_config_rwlock);
			if (err) {
				if (!hard)
					return (err);
				lasterr = err;
				continue;
			}
			if (ds->ds_phys->ds_prev_snap_txg > txg)
				txg = ds->ds_phys->ds_prev_snap_txg;
			err = traverse_dataset(ds, txg, flags, func, arg);
			dsl_dataset_rele(ds, FTAG);
			if (err) {
				if (!hard)
					return (err);
				lasterr = err;
			}
		}
	}
	if (err == ESRCH)
		err = 0;
	return (err != 0 ? err : lasterr);
}