/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
#include <sys/dbuf.h>
#include <sys/dnode.h>
#include <sys/zfs_context.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_traverse.h>
#include <sys/dsl_dataset.h>
#include <sys/dsl_dir.h>
#include <sys/dsl_prop.h>
#include <sys/dsl_pool.h>
#include <sys/dsl_synctask.h>
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
#include <sys/zfs_znode.h>
#include <zfs_fletcher.h>
#include <sys/avl.h>
#include <sys/ddt.h>
#include <sys/zfs_onexit.h>

/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
int zfs_send_corrupt_data = B_FALSE;

static char *dmu_recv_tag = "dmu_recv_tag";

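/*
 * Write len bytes of stream data, folding them into the running
 * fletcher-4 checksum.  When the caller only wants the stream size
 * (dsp->sendsize is set), the write and checksum are skipped, but the
 * stream offset still advances, so *dsa_off ends up reflecting the
 * size the stream would have had.
 */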
static int
dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
{
	dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
	ssize_t resid; /* have to get resid to get detailed errno */
	ASSERT3U(len % 8, ==, 0);

	dsp->dsa_err = 0;
	if (!dsp->sendsize) {
		fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
		dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
		    (caddr_t)buf, len,
		    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY,
		    CRED(), &resid);
	}
	mutex_enter(&ds->ds_sendstream_lock);
	*dsp->dsa_off += len;
	mutex_exit(&ds->ds_sendstream_lock);

	return (dsp->dsa_err);
}

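/*
 * Queue a DRR_FREE record for (object, offset, length).  Rather than
 * writing each record immediately, keep the most recent free pending
 * so that frees of adjacent ranges in the same object can be coalesced
 * into a single record.  A length of -1ULL means "free to the end of
 * the object" and is pushed out immediately, since nothing can be
 * aggregated after it.
 */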
static int
dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
    uint64_t length)
{
	struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);

	if (length != -1ULL && offset + length < offset)
		length = -1ULL;

	/*
	 * If there is a pending op, but it's not PENDING_FREE, push it out,
	 * since free block aggregation can only be done for blocks of the
	 * same type (i.e., DRR_FREE records can only be aggregated with
	 * other DRR_FREE records; DRR_FREEOBJECTS records can only be
	 * aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	if (dsp->dsa_pending_op == PENDING_FREE) {
		/*
		 * There should never be a PENDING_FREE if length is -1
		 * (because dump_dnode is the only place where this
		 * function is called with a -1, and only after flushing
		 * any pending record).
		 */
		ASSERT(length != -1ULL);
		/*
		 * Check to see whether this free block can be aggregated
		 * with the pending one.
		 */
		if (drrf->drr_object == object && drrf->drr_offset +
		    drrf->drr_length == offset) {
			drrf->drr_length += length;
			return (0);
		} else {
			/* not a continuation.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}
	/* create a FREE record and make it pending */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREE;
	drrf->drr_object = object;
	drrf->drr_offset = offset;
	drrf->drr_length = length;
	drrf->drr_toguid = dsp->dsa_toguid;
	if (length == -1ULL) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
	} else {
		dsp->dsa_pending_op = PENDING_FREE;
	}

	return (0);
}

static int
dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
    uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
{
	struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);

	/*
	 * If there is any kind of pending aggregation (currently either
	 * a grouping of free objects or free blocks), push it out to
	 * the stream, since aggregation can't be done across operations
	 * of different types.
	 */
	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}
	/* write a DATA record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_WRITE;
	drrw->drr_object = object;
	drrw->drr_type = type;
	drrw->drr_offset = offset;
	drrw->drr_length = blksz;
	drrw->drr_toguid = dsp->dsa_toguid;
	drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
	if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
		drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
	DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
	DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
	DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
	drrw->drr_key.ddk_cksum = bp->blk_cksum;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	if (dump_bytes(dsp, data, blksz) != 0)
		return (EINTR);
	return (0);
}

static int
dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
{
	struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write a SPILL record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_SPILL;
	drrs->drr_object = object;
	drrs->drr_length = blksz;
	drrs->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);
	if (dump_bytes(dsp, data, blksz) != 0)
		return (EINTR);
	return (0);
}

static int
dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
{
	struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);

	/*
	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
	 * push it out, since free block aggregation can only be done for
	 * blocks of the same type (i.e., DRR_FREE records can only be
	 * aggregated with other DRR_FREE records; DRR_FREEOBJECTS records
	 * can only be aggregated with other DRR_FREEOBJECTS records).
	 */
	if (dsp->dsa_pending_op != PENDING_NONE &&
	    dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}
	if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
		/*
		 * See whether this free object array can be aggregated
		 * with the pending one.
		 */
		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
			drrfo->drr_numobjs += numobjs;
			return (0);
		} else {
			/* can't be aggregated.  Push out pending record */
			if (dump_bytes(dsp, dsp->dsa_drr,
			    sizeof (dmu_replay_record_t)) != 0)
				return (EINTR);
			dsp->dsa_pending_op = PENDING_NONE;
		}
	}

	/* write a FREEOBJECTS record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
	drrfo->drr_firstobj = firstobj;
	drrfo->drr_numobjs = numobjs;
	drrfo->drr_toguid = dsp->dsa_toguid;

	dsp->dsa_pending_op = PENDING_FREEOBJECTS;

	return (0);
}

static int
dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
{
	struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);

	if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
		return (dump_freeobjects(dsp, object, 1));

	if (dsp->dsa_pending_op != PENDING_NONE) {
		if (dump_bytes(dsp, dsp->dsa_drr,
		    sizeof (dmu_replay_record_t)) != 0)
			return (EINTR);
		dsp->dsa_pending_op = PENDING_NONE;
	}

	/* write an OBJECT record */
	bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
	dsp->dsa_drr->drr_type = DRR_OBJECT;
	drro->drr_object = object;
	drro->drr_type = dnp->dn_type;
	drro->drr_bonustype = dnp->dn_bonustype;
	drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
	drro->drr_bonuslen = dnp->dn_bonuslen;
	drro->drr_checksumtype = dnp->dn_checksum;
	drro->drr_compress = dnp->dn_compress;
	drro->drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
		return (EINTR);

	if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
		return (EINTR);

	/* free anything past the end of the file */
	if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
	    (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
		return (EINTR);
	if (dsp->dsa_err)
		return (EINTR);
	return (0);
}

#define	BP_SPAN(dnp, level) \
	(((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
	(level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))

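/*
 * For example (assuming 128K data blocks and the common 16K indirect
 * block size): a level-0 block pointer spans 256 << 9 = 128K of object
 * data, and each additional level multiplies the span by
 * 2^(indblkshift - SPA_BLKPTRSHIFT) = 2^(14 - 7) = 128, so a level-1
 * block pointer spans 16M.
 */

/*
 * Callback passed to traverse_dataset().  It is invoked once per block
 * pointer visited and translates each one into the appropriate stream
 * record: holes become DRR_FREE/DRR_FREEOBJECTS records, dnode blocks
 * are decomposed into per-dnode DRR_OBJECT records, spill blocks become
 * DRR_SPILL records, and level-0 data blocks become DRR_WRITE records.
 */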
/* ARGSUSED */
static int
backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
    const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
{
	dmu_sendarg_t *dsp = arg;
	dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
	int err = 0;

	if (issig(JUSTLOOKING) && issig(FORREAL))
		return (EINTR);

	if (zb->zb_object != DMU_META_DNODE_OBJECT &&
	    DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
		return (0);
	} else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
		err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
	} else if (bp == NULL) {
		uint64_t span = BP_SPAN(dnp, zb->zb_level);
		err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
	} else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
		return (0);
	} else if (type == DMU_OT_DNODE) {
		dnode_phys_t *blk;
		int i;
		int blksz = BP_GET_LSIZE(bp);
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;

		if (dsl_read(NULL, spa, bp, pbuf,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		blk = abuf->b_data;
		for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
			uint64_t dnobj = (zb->zb_blkid <<
			    (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
			err = dump_dnode(dsp, dnobj, blk+i);
			if (err)
				break;
		}
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else if (type == DMU_OT_SA) {
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf;
		int blksz = BP_GET_LSIZE(bp);

		if (arc_read_nolock(NULL, spa, bp,
		    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
		    ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
			return (EIO);

		err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
		(void) arc_buf_remove_ref(abuf, &abuf);
	} else { /* it's a level-0 block of a regular object */
		uint32_t aflags = ARC_WAIT;
		arc_buf_t *abuf = NULL;
		void *buf = NULL;
		int blksz = BP_GET_LSIZE(bp);

		if (!dsp->sendsize) {
			if (dsl_read(NULL, spa, bp, pbuf,
			    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
			    ZIO_FLAG_CANFAIL, &aflags, zb) != 0) {
				if (zfs_send_corrupt_data) {
					/*
					 * Send a block filled with
					 * 0x"zfs badd bloc".
					 */
					abuf = arc_buf_alloc(spa, blksz, &abuf,
					    ARC_BUFC_DATA);
					uint64_t *ptr;
					for (ptr = abuf->b_data;
					    (char *)ptr <
					    (char *)abuf->b_data + blksz;
					    ptr++)
						*ptr = 0x2f5baddb10c;
				} else {
					return (EIO);
				}
			}
			buf = abuf->b_data;
		}

		err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
		    blksz, bp, buf);
		if (!dsp->sendsize) {
			(void) arc_buf_remove_ref(abuf, &abuf);
		}
	}

	ASSERT(err == 0 || err == EINTR);
	return (err);
}

/*
 * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
 * For example, they could both be snapshots of the same filesystem, and
 * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
 * 'later's filesystem.  Or 'earlier' could be an older snapshot in the
 * origin's filesystem.  Or 'earlier' could be the origin's origin.
 */
static boolean_t
is_before(dsl_dataset_t *later, dsl_dataset_t *earlier)
{
	dsl_pool_t *dp = later->ds_dir->dd_pool;
	int error;
	boolean_t ret;
	dsl_dataset_t *origin;

	if (earlier->ds_phys->ds_creation_txg >=
	    later->ds_phys->ds_creation_txg)
		return (B_FALSE);

	if (later->ds_dir == earlier->ds_dir)
		return (B_TRUE);
	if (!dsl_dir_is_clone(later->ds_dir))
		return (B_FALSE);

	rw_enter(&dp->dp_config_rwlock, RW_READER);
	if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) {
		rw_exit(&dp->dp_config_rwlock);
		return (B_TRUE);
	}
	error = dsl_dataset_hold_obj(dp,
	    later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
	rw_exit(&dp->dp_config_rwlock);
	if (error != 0)
		return (B_FALSE);
	ret = is_before(origin, earlier);
	dsl_dataset_rele(origin, FTAG);
	return (ret);
}

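/*
 * Generate a full or incremental send stream on vp: a DRR_BEGIN record,
 * the records produced by traversing the dataset from fromtxg (0 for a
 * full send), and a DRR_END record carrying the accumulated fletcher-4
 * checksum.  With sendsize set, no data is actually written; only the
 * stream offset is advanced (see dump_bytes).
 */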
int
dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp,
    offset_t *off, boolean_t sendsize)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dmu_replay_record_t *drr;
	dmu_sendarg_t *dsp;
	int err;
	uint64_t fromtxg = 0;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !is_before(ds, fromds))
		return (EXDEV);

	drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
	drr->drr_type = DRR_BEGIN;
	drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
	DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
	    DMU_SUBSTREAM);

#ifdef _KERNEL
	if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
		uint64_t version;
		if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) {
			kmem_free(drr, sizeof (dmu_replay_record_t));
			return (EINVAL);
		}
		if (version == ZPL_VERSION_SA) {
			DMU_SET_FEATUREFLAGS(
			    drr->drr_u.drr_begin.drr_versioninfo,
			    DMU_BACKUP_FEATURE_SA_SPILL);
		}
	}
#endif

	drr->drr_u.drr_begin.drr_creation_time =
	    ds->ds_phys->ds_creation_time;
	drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
	if (fromds != NULL && ds->ds_dir != fromds->ds_dir)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
	drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
	if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
		drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;

	if (fromds)
		drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
	dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);

	if (fromds)
		fromtxg = fromds->ds_phys->ds_creation_txg;

	dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);

	dsp->dsa_drr = drr;
	dsp->dsa_vp = vp;
	dsp->dsa_outfd = outfd;
	dsp->dsa_proc = curproc;
	dsp->dsa_os = tosnap;
	dsp->dsa_off = off;
	dsp->dsa_toguid = ds->ds_phys->ds_guid;
	ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
	dsp->dsa_pending_op = PENDING_NONE;
	dsp->sendsize = sendsize;

	mutex_enter(&ds->ds_sendstream_lock);
	list_insert_head(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

	if (dsp->sendsize) {
		err = traverse_dataset(ds, fromtxg,
		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
		    backup_cb, dsp);
	} else {
		err = traverse_dataset(ds,
		    fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
		    backup_cb, dsp);
	}

	if (dsp->dsa_pending_op != PENDING_NONE)
		if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
			err = EINTR;

	if (err) {
		if (err == EINTR && dsp->dsa_err)
			err = dsp->dsa_err;
		goto out;
	}

	bzero(drr, sizeof (dmu_replay_record_t));
	drr->drr_type = DRR_END;
	drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
	drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;

	if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
		err = dsp->dsa_err;
		goto out;
	}

out:
	mutex_enter(&ds->ds_sendstream_lock);
	list_remove(&ds->ds_sendstreams, dsp);
	mutex_exit(&ds->ds_sendstream_lock);

	kmem_free(drr, sizeof (dmu_replay_record_t));
	kmem_free(dsp, sizeof (dmu_sendarg_t));

	return (err);
}

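/*
 * Estimate the size of the stream dmu_send() would generate, without
 * traversing the dataset: start from the uncompressed size of the
 * changed data (all of tosnap for a full send, or the output of
 * dsl_dataset_space_written() for an incremental) and adjust for the
 * per-block overheads computed below.
 */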
int
dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep)
{
	dsl_dataset_t *ds = tosnap->os_dsl_dataset;
	dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	int err;
	uint64_t size;

	/* tosnap must be a snapshot */
	if (ds->ds_phys->ds_next_snap_obj == 0)
		return (EINVAL);

	/*
	 * fromsnap must be an earlier snapshot from the same fs as tosnap,
	 * or the origin's fs.
	 */
	if (fromds != NULL && !is_before(ds, fromds))
		return (EXDEV);

	/* Get uncompressed size estimate of changed data. */
	if (fromds == NULL) {
		size = ds->ds_phys->ds_uncompressed_bytes;
	} else {
		uint64_t used, comp;
		err = dsl_dataset_space_written(fromds, ds,
		    &used, &comp, &size);
		if (err)
			return (err);
	}

	/*
	 * Assume that space (both on-disk and in-stream) is dominated by
	 * data.  We will adjust for indirect blocks and the copies property,
	 * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
	 */

	/*
	 * Subtract out approximate space used by indirect blocks.
	 * Assume most space is used by data blocks (non-indirect, non-dnode).
	 * Assume all blocks are recordsize.  Assume ditto blocks and
	 * internal fragmentation cancel out compression.
	 *
	 * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
	 * block, which we observe in practice.
	 */
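	/*
	 * For example (assuming the default 128K recordsize): each 128K
	 * of data carries roughly one blkptr_t of indirect overhead on
	 * disk and one dmu_replay_record_t of header overhead in the
	 * stream, so the two adjustments below each change the estimate
	 * by well under one percent.
	 */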
	uint64_t recordsize;
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	err = dsl_prop_get_ds(ds, "recordsize",
	    sizeof (recordsize), 1, &recordsize, NULL);
	rw_exit(&dp->dp_config_rwlock);
	if (err)
		return (err);
	size -= size / recordsize * sizeof (blkptr_t);

	/* Add in the space for the record associated with each block. */
	size += size / recordsize * sizeof (dmu_replay_record_t);

	*sizep = size;

	return (0);
}

struct recvbeginsyncarg {
	const char *tofs;
	const char *tosnap;
	dsl_dataset_t *origin;
	uint64_t fromguid;
	dmu_objset_type_t type;
	void *tag;
	boolean_t force;
	uint64_t dsflags;
	char clonelastname[MAXNAMELEN];
	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
	cred_t *cr;
};

/* ARGSUSED */
static int
recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t val;
	int err;

	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);

	if (err != ENOENT)
		return (err ? err : EEXIST);

	if (rbsa->origin) {
		/* make sure it's a snap in the same pool */
		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
			return (EXDEV);
		if (!dsl_dataset_is_snapshot(rbsa->origin))
			return (EINVAL);
		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
			return (ENODEV);
	}

	return (0);
}

static void
recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* Create and open new dataset. */
	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
	    rbsa->origin, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
	    B_TRUE, dmu_recv_tag, &rbsa->ds));

	if (rbsa->origin == NULL) {
		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
	}

	spa_history_log_internal_ds(rbsa->ds, "receive new", tx, "");
}

/* ARGSUSED */
static int
recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	int err;
	uint64_t val;

	/* must not have any changes since most recent snapshot */
	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
		return (ETXTBSY);

	/* new snapshot name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	if (rbsa->fromguid) {
		/* if incremental, most recent snapshot must match fromguid */
		if (ds->ds_prev == NULL)
			return (ENODEV);

		/*
		 * most recent snapshot must match fromguid, or there are no
		 * changes since the fromguid one
		 */
		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
			while (obj != 0) {
				dsl_dataset_t *snap;
				err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
				    obj, FTAG, &snap);
				if (err)
					return (ENODEV);
				if (snap->ds_phys->ds_creation_txg < birth) {
					dsl_dataset_rele(snap, FTAG);
					return (ENODEV);
				}
				if (snap->ds_phys->ds_guid == rbsa->fromguid) {
					dsl_dataset_rele(snap, FTAG);
					break; /* it's ok */
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
			}
			if (obj == 0)
				return (ENODEV);
		}
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (ENODEV);
	}

	/* temporary clone name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
	    rbsa->clonelastname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	return (0);
}

/* ARGSUSED */
static void
recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ohds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
	dsl_dataset_t *cds;
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* create and open the temporary clone */
	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
	    ohds->ds_prev, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
	}

	rbsa->ds = cds;

	spa_history_log_internal_ds(cds, "receive over existing", tx, "");
}

static boolean_t
dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
{
	int featureflags;

	featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);

	/*
	 * Verify pool version supports SA if the SA_SPILL feature is set:
	 * return B_TRUE (i.e., fail verification) if the stream requires
	 * SA but the pool version predates SA support.
	 */
	return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
	    (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
}

/*
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
	int err = 0;
	boolean_t byteswap;
	struct recvbeginsyncarg rbsa = { 0 };
	uint64_t versioninfo;
	int flags;
	dsl_dataset_t *ds;

	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
		byteswap = FALSE;
	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		byteswap = TRUE;
	else
		return (EINVAL);

	rbsa.tofs = tofs;
	rbsa.tosnap = tosnap;
	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
	rbsa.fromguid = drrb->drr_fromguid;
	rbsa.type = drrb->drr_type;
	rbsa.tag = FTAG;
	rbsa.dsflags = 0;
	rbsa.cr = CRED();
	versioninfo = drrb->drr_versioninfo;
	flags = drrb->drr_flags;

	if (byteswap) {
		rbsa.type = BSWAP_32(rbsa.type);
		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
		versioninfo = BSWAP_64(versioninfo);
		flags = BSWAP_32(flags);
	}

	if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
	    rbsa.type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
		return (EINVAL);

	if (flags & DRR_FLAG_CI_DATA)
		rbsa.dsflags = DS_FLAG_CI_DATASET;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_top_ds = top_ds;
	drc->drc_force = force;

	/*
	 * Process the begin in syncing context.
	 */

	/* open the dataset we are logically receiving into */
	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
	if (err == 0) {
		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (ENOTSUP);
		}
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EINVAL);
		}

		/* must not have an incremental recv already in progress */
		if (!mutex_tryenter(&ds->ds_recvlock)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EBUSY);
		}

		/* tmp clone name is: tofs/%tosnap */
		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
		    "%%%s", tosnap);
		rbsa.force = force;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
		if (err) {
			mutex_exit(&ds->ds_recvlock);
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (err);
		}
		drc->drc_logical_ds = ds;
		drc->drc_real_ds = rbsa.ds;
	} else if (err == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char *cp;

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
			return (ENOENT);

		/* Open the parent of tofs */
		cp = strrchr(tofs, '/');
		*cp = '\0';
		err = dsl_dataset_hold(tofs, FTAG, &ds);
		*cp = '/';
		if (err)
			return (err);

		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, FTAG);
			return (ENOTSUP);
		}

		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
		dsl_dataset_rele(ds, FTAG);
		if (err)
			return (err);
		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
		drc->drc_newfs = B_TRUE;
	}

	return (err);
}

struct restorearg {
	int err;
	int byteswap;
	vnode_t *vp;
	char *buf;
	uint64_t voff;
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;
	avl_tree_t *guid_to_ds_map;
};

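/*
 * Dedup'ed streams identify previously-sent data by the GUID of the
 * snapshot it was first written to.  Each guid_map_entry_t maps such a
 * GUID to a held dataset; restore_write_byref() uses the AVL tree of
 * these entries to translate a drr_refguid back into an objset from
 * which the earlier copy of the data can be read.
 */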
typedef struct guid_map_entry {
	uint64_t	guid;
	dsl_dataset_t	*gme_ds;
	avl_node_t	avlnode;
} guid_map_entry_t;

static int
guid_compare(const void *arg1, const void *arg2)
{
	const guid_map_entry_t *gmep1 = arg1;
	const guid_map_entry_t *gmep2 = arg2;

	if (gmep1->guid < gmep2->guid)
		return (-1);
	else if (gmep1->guid > gmep2->guid)
		return (1);
	return (0);
}

static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		dsl_dataset_rele(gmep->gme_ds, ca);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}

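/*
 * Read len bytes from the stream into ra->buf, folding them into the
 * running fletcher-4 checksum (byteswapped if the stream came from a
 * host of the opposite endianness).  Returns a pointer into ra->buf,
 * or NULL with ra->err set on failure.
 */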
static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must be */
	ASSERT3U(len % 8, ==, 0);

	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		if (resid == len - done)
			ra->err = EINVAL;
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}

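/*
 * Byteswap an entire replay record in place.  Records are sent in the
 * sender's native byte order, so a receiver of the opposite endianness
 * must swap each field according to the record type.
 */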
static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}

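/*
 * Apply a DRR_OBJECT record.  If the object number is currently free
 * on the receiving side, claim it with the shape specified by the
 * record; if it is already allocated, reclaim it so that its block
 * size, bonus type, and bonus length match the record.
 */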
static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	if (drro->drr_type == DMU_OT_NONE ||
	    !DMU_OT_IS_VALID(drro->drr_type) ||
	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (EINVAL);
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	if (err != 0 && err != ENOENT)
		return (EINVAL);

	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err) {
		return (EINVAL);
	}

	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		if (ra->byteswap) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drro->drr_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_freeobjects(struct restorearg *ra, objset_t *os,
    struct drr_freeobjects *drrfo)
{
	uint64_t obj;

	if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
		return (EINVAL);

	for (obj = drrfo->drr_firstobj;
	    obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
	    (void) dmu_object_next(os, &obj, FALSE, 0)) {
		int err;

		if (dmu_object_info(os, obj, NULL) != 0)
			continue;

		err = dmu_free_object(os, obj);
		if (err)
			return (err);
	}
	return (0);
}

static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (EINVAL);

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	if (ra->byteswap) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
	}
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}

/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t	where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (EINVAL);

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (EINVAL);
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (EINVAL);
	} else {
		ref_os = os;
	}

	if ((err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH)) != 0)
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}

static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (EINVAL);

	data = restore_read(ra, drrs->drr_length);
	if (data == NULL)
		return (ra->err);

	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (EINVAL);

	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}

/* ARGSUSED */
static int
restore_free(struct restorearg *ra, objset_t *os,
    struct drr_free *drrf)
{
	int err;

	if (drrf->drr_length != -1ULL &&
	    drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
		return (EINVAL);

	if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
		return (EINVAL);

	err = dmu_free_long_range(os, drrf->drr_object,
	    drrf->drr_offset, drrf->drr_length);
	return (err);
}
1365 
/*
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
        struct restorearg ra = { 0 };
        dmu_replay_record_t *drr;
        objset_t *os;
        zio_cksum_t pcksum;
        int featureflags;

        if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
                ra.byteswap = TRUE;

        {
                /* compute checksum of drr_begin record */
                dmu_replay_record_t *drr;
                drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);

                drr->drr_type = DRR_BEGIN;
                drr->drr_u.drr_begin = *drc->drc_drrb;
                if (ra.byteswap) {
                        fletcher_4_incremental_byteswap(drr,
                            sizeof (dmu_replay_record_t), &ra.cksum);
                } else {
                        fletcher_4_incremental_native(drr,
                            sizeof (dmu_replay_record_t), &ra.cksum);
                }
                kmem_free(drr, sizeof (dmu_replay_record_t));
        }

        if (ra.byteswap) {
                struct drr_begin *drrb = drc->drc_drrb;
                drrb->drr_magic = BSWAP_64(drrb->drr_magic);
                drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
                drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
                drrb->drr_type = BSWAP_32(drrb->drr_type);
                drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
                drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
        }

        ra.vp = vp;
        ra.voff = *voffp;
        ra.bufsize = 1<<20;
        ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
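        /*
         * The 1MB buffer above must be able to hold the largest single
         * record payload; with block sizes capped at SPA_MAXBLOCKSIZE
         * (128K at this time), this presumably leaves ample headroom.
         */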

        /* these were verified in dmu_recv_begin */
        ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
            DMU_SUBSTREAM);
        ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);

        /*
         * Open the objset we are modifying.
         */
        VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);

        ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

        featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

        /* if this stream is dedup'ed, set up the AVL tree for guid mapping */
        if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
                minor_t minor;

                if (cleanup_fd == -1) {
                        ra.err = EBADF;
                        goto out;
                }
                ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
                if (ra.err) {
                        cleanup_fd = -1;
                        goto out;
                }

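                /*
                 * The guid map can be shared by the several substream
                 * receives that make up one "zfs receive" invocation:
                 * the first substream allocates the map and registers
                 * it via an on-exit callback on cleanup_fd, and later
                 * substreams find it again through *action_handlep.
                 */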
                if (*action_handlep == 0) {
                        ra.guid_to_ds_map =
                            kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
                        avl_create(ra.guid_to_ds_map, guid_compare,
                            sizeof (guid_map_entry_t),
                            offsetof(guid_map_entry_t, avlnode));
                        ra.err = zfs_onexit_add_cb(minor,
                            free_guid_map_onexit, ra.guid_to_ds_map,
                            action_handlep);
                        if (ra.err)
                                goto out;
                } else {
                        ra.err = zfs_onexit_cb_data(minor, *action_handlep,
                            (void **)&ra.guid_to_ds_map);
                        if (ra.err)
                                goto out;
                }

                drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
        }

        /*
         * Read records and process them.
         */
        pcksum = ra.cksum;
        while (ra.err == 0 &&
            NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
                if (issig(JUSTLOOKING) && issig(FORREAL)) {
                        ra.err = EINTR;
                        goto out;
                }

                if (ra.byteswap)
                        backup_byteswap(drr);

                switch (drr->drr_type) {
                case DRR_OBJECT:
                {
                        /*
                         * We need to make a copy of the record header,
                         * because restore_{object,write} may need to call
                         * restore_read(), which will invalidate drr.
                         */
                        struct drr_object drro = drr->drr_u.drr_object;
                        ra.err = restore_object(&ra, os, &drro);
                        break;
                }
                case DRR_FREEOBJECTS:
                {
                        struct drr_freeobjects drrfo =
                            drr->drr_u.drr_freeobjects;
                        ra.err = restore_freeobjects(&ra, os, &drrfo);
                        break;
                }
                case DRR_WRITE:
                {
                        struct drr_write drrw = drr->drr_u.drr_write;
                        ra.err = restore_write(&ra, os, &drrw);
                        break;
                }
                case DRR_WRITE_BYREF:
                {
                        struct drr_write_byref drrwbr =
                            drr->drr_u.drr_write_byref;
                        ra.err = restore_write_byref(&ra, os, &drrwbr);
                        break;
                }
                case DRR_FREE:
                {
                        struct drr_free drrf = drr->drr_u.drr_free;
                        ra.err = restore_free(&ra, os, &drrf);
                        break;
                }
                case DRR_END:
                {
                        struct drr_end drre = drr->drr_u.drr_end;
                        /*
                         * We compare against the *previous* checksum
                         * value, because the stored checksum is of
                         * everything before the DRR_END record.
                         */
                        if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
                                ra.err = ECKSUM;
                        goto out;
                }
                case DRR_SPILL:
                {
                        struct drr_spill drrs = drr->drr_u.drr_spill;
                        ra.err = restore_spill(&ra, os, &drrs);
                        break;
                }
                default:
                        ra.err = EINVAL;
                        goto out;
                }
                pcksum = ra.cksum;
        }
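        /*
         * A well-formed stream is terminated by a DRR_END record, which
         * jumps to 'out' above.  Falling out of the loop therefore means
         * either a restore callback or restore_read() failed, so ra.err
         * must be nonzero here.
         */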
        ASSERT(ra.err != 0);

out:
        if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
                zfs_onexit_fd_rele(cleanup_fd);

        if (ra.err != 0) {
                /*
                 * Destroy what we created, so we don't leave the dataset
                 * in the inconsistent restoring state.
                 */
                txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);

                (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
                    B_FALSE);
                if (drc->drc_real_ds != drc->drc_logical_ds) {
                        mutex_exit(&drc->drc_logical_ds->ds_recvlock);
                        dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
                }
        }

        kmem_free(ra.buf, ra.bufsize);
        *voffp = ra.voff;
        return (ra.err);
}

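/*
 * dmu_recv_end() finalizes a receive by snapshotting what was received.
 * The check/sync pair below runs as a DSL sync task: it creates the
 * snapshot, stamps it with the creation time and guid carried in the
 * stream's begin record, and clears the DS_FLAG_INCONSISTENT flag that
 * was set for the duration of the receive.
 */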
struct recvendsyncarg {
        char *tosnap;
        uint64_t creation_time;
        uint64_t toguid;
};

static int
recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        struct recvendsyncarg *resa = arg2;

        return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
}

static void
recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
        dsl_dataset_t *ds = arg1;
        struct recvendsyncarg *resa = arg2;

        dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);

        /* set snapshot's creation time and guid */
        dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
        ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
        ds->ds_prev->ds_phys->ds_guid = resa->toguid;
        ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

        dmu_buf_will_dirty(ds->ds_dbuf, tx);
        ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
        spa_history_log_internal_ds(ds, "finished receiving", tx, "");
}

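/*
 * Add the dataset's newly-created snapshot to the dedup guid map, keyed
 * by guid, so that later streams in a dedup'ed send package can resolve
 * their DRR_WRITE_BYREF records against it.  The hold taken here is
 * dropped when the map is torn down (see free_guid_map_onexit()).
 */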
static int
add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
{
        dsl_pool_t *dp = ds->ds_dir->dd_pool;
        uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
        dsl_dataset_t *snapds;
        guid_map_entry_t *gmep;
        int err;

        ASSERT(guid_map != NULL);

        rw_enter(&dp->dp_config_rwlock, RW_READER);
        err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
        if (err == 0) {
                gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
                gmep->guid = snapds->ds_phys->ds_guid;
                gmep->gme_ds = snapds;
                avl_add(guid_map, gmep);
        }

        rw_exit(&dp->dp_config_rwlock);
        return (err);
}

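/*
 * Complete a receive into an existing filesystem: swap the received
 * contents (drc_real_ds, the hidden dataset we received into) with the
 * target (drc_logical_ds), snapshot the result, and destroy the
 * now-stale received dataset.
 */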
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
        struct recvendsyncarg resa;
        dsl_dataset_t *ds = drc->drc_logical_ds;
        int err, myerr;

        /*
         * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
         * expects it to have a ds_user_ptr (and zil), but clone_swap()
         * can close it.
         */
        txg_wait_synced(ds->ds_dir->dd_pool, 0);

        if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
                err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
                    drc->drc_force);
                if (err)
                        goto out;
        } else {
                mutex_exit(&ds->ds_recvlock);
                dsl_dataset_rele(ds, dmu_recv_tag);
                (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
                    B_FALSE);
                return (EBUSY);
        }

        resa.creation_time = drc->drc_drrb->drr_creation_time;
        resa.toguid = drc->drc_drrb->drr_toguid;
        resa.tosnap = drc->drc_tosnap;

        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            recv_end_check, recv_end_sync, ds, &resa, 3);
        if (err) {
                /* swap back */
                (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
        }

out:
        mutex_exit(&ds->ds_recvlock);
        if (err == 0 && drc->drc_guid_to_ds_map != NULL)
                (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
        dsl_dataset_disown(ds, dmu_recv_tag);
        myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
        ASSERT3U(myerr, ==, 0);
        return (err);
}

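/*
 * Complete a receive into a filesystem that dmu_recv_begin() created;
 * here drc_logical_ds and drc_real_ds are the same dataset, so there is
 * nothing to swap: we snapshot it on success, or destroy it on failure.
 */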
static int
dmu_recv_new_end(dmu_recv_cookie_t *drc)
{
        struct recvendsyncarg resa;
        dsl_dataset_t *ds = drc->drc_logical_ds;
        int err;

        /*
         * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
         * expects it to have a ds_user_ptr (and zil), but clone_swap()
         * can close it.
         */
        txg_wait_synced(ds->ds_dir->dd_pool, 0);

        resa.creation_time = drc->drc_drrb->drr_creation_time;
        resa.toguid = drc->drc_drrb->drr_toguid;
        resa.tosnap = drc->drc_tosnap;

        err = dsl_sync_task_do(ds->ds_dir->dd_pool,
            recv_end_check, recv_end_sync, ds, &resa, 3);
        if (err) {
                /* clean up the fs we just recv'd into */
                (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
        } else {
                if (drc->drc_guid_to_ds_map != NULL)
                        (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
                /* release the hold from dmu_recv_begin */
                dsl_dataset_disown(ds, dmu_recv_tag);
        }
        return (err);
}

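/*
 * Finish off a receive, dispatching on whether we received on top of an
 * existing filesystem (logical and real datasets differ) or into a
 * brand-new one (they are the same dataset).
 */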
int
dmu_recv_end(dmu_recv_cookie_t *drc)
{
        if (drc->drc_logical_ds != drc->drc_real_ds)
                return (dmu_recv_existing_end(drc));
        else
                return (dmu_recv_new_end(drc));
}