2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
    
      
    
    
          --- old/usr/src/uts/common/fs/zfs/dmu_send.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_send.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  
  
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - * Copyright (c) 2011 by Delphix. All rights reserved.
  24      - */
  25      -/*
  26   23   * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  27      - * Copyright (c) 2011 by Delphix. All rights reserved.
       24 + * Copyright (c) 2012 by Delphix. All rights reserved.
  28   25   * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  29   26   */
  30   27  
  31   28  #include <sys/dmu.h>
  32   29  #include <sys/dmu_impl.h>
  33   30  #include <sys/dmu_tx.h>
  34   31  #include <sys/dbuf.h>
  35   32  #include <sys/dnode.h>
  36   33  #include <sys/zfs_context.h>
  37   34  #include <sys/dmu_objset.h>
  38   35  #include <sys/dmu_traverse.h>
  39   36  #include <sys/dsl_dataset.h>
  40   37  #include <sys/dsl_dir.h>
  41   38  #include <sys/dsl_prop.h>
  42   39  #include <sys/dsl_pool.h>
  43   40  #include <sys/dsl_synctask.h>
  44   41  #include <sys/zfs_ioctl.h>
  45   42  #include <sys/zap.h>
  46   43  #include <sys/zio_checksum.h>
  47   44  #include <sys/zfs_znode.h>
  48   45  #include <zfs_fletcher.h>
  49   46  #include <sys/avl.h>
  50   47  #include <sys/ddt.h>
  51   48  #include <sys/zfs_onexit.h>
  52   49  
  53   50  /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
  54   51  int zfs_send_corrupt_data = B_FALSE;
  55   52  
  56   53  static char *dmu_recv_tag = "dmu_recv_tag";
  57   54  
  58   55  static int
  59   56  dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
  60   57  {
  61   58          dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
  62   59          ssize_t resid; /* have to get resid to get detailed errno */
  63   60          ASSERT3U(len % 8, ==, 0);
  64   61  
  65   62          fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
  66   63          dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
  67   64              (caddr_t)buf, len,
  68   65              0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
  69   66  
  70   67          mutex_enter(&ds->ds_sendstream_lock);
  71   68          *dsp->dsa_off += len;
  72   69          mutex_exit(&ds->ds_sendstream_lock);
  73   70  
  74   71          return (dsp->dsa_err);
  75   72  }
  76   73  
  77   74  static int
  78   75  dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
  79   76      uint64_t length)
  80   77  {
  81   78          struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
  82   79  
  83   80          /*
  84   81           * If there is a pending op, but it's not PENDING_FREE, push it out,
  85   82           * since free block aggregation can only be done for blocks of the
  86   83           * same type (i.e., DRR_FREE records can only be aggregated with
  87   84           * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
  88   85           * aggregated with other DRR_FREEOBJECTS records.
  89   86           */
  90   87          if (dsp->dsa_pending_op != PENDING_NONE &&
  91   88              dsp->dsa_pending_op != PENDING_FREE) {
  92   89                  if (dump_bytes(dsp, dsp->dsa_drr,
  93   90                      sizeof (dmu_replay_record_t)) != 0)
  94   91                          return (EINTR);
  95   92                  dsp->dsa_pending_op = PENDING_NONE;
  96   93          }
  97   94  
  98   95          if (dsp->dsa_pending_op == PENDING_FREE) {
  99   96                  /*
 100   97                   * There should never be a PENDING_FREE if length is -1
 101   98                   * (because dump_dnode is the only place where this
 102   99                   * function is called with a -1, and only after flushing
 103  100                   * any pending record).
 104  101                   */
 105  102                  ASSERT(length != -1ULL);
 106  103                  /*
 107  104                   * Check to see whether this free block can be aggregated
 108  105                   * with pending one.
 109  106                   */
 110  107                  if (drrf->drr_object == object && drrf->drr_offset +
 111  108                      drrf->drr_length == offset) {
 112  109                          drrf->drr_length += length;
 113  110                          return (0);
 114  111                  } else {
 115  112                          /* not a continuation.  Push out pending record */
 116  113                          if (dump_bytes(dsp, dsp->dsa_drr,
 117  114                              sizeof (dmu_replay_record_t)) != 0)
 118  115                                  return (EINTR);
 119  116                          dsp->dsa_pending_op = PENDING_NONE;
 120  117                  }
 121  118          }
 122  119          /* create a FREE record and make it pending */
 123  120          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 124  121          dsp->dsa_drr->drr_type = DRR_FREE;
 125  122          drrf->drr_object = object;
 126  123          drrf->drr_offset = offset;
 127  124          drrf->drr_length = length;
 128  125          drrf->drr_toguid = dsp->dsa_toguid;
 129  126          if (length == -1ULL) {
 130  127                  if (dump_bytes(dsp, dsp->dsa_drr,
 131  128                      sizeof (dmu_replay_record_t)) != 0)
 132  129                          return (EINTR);
 133  130          } else {
 134  131                  dsp->dsa_pending_op = PENDING_FREE;
 135  132          }
 136  133  
 137  134          return (0);
 138  135  }
 139  136  
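The pending-op machinery above buffers at most one record in dsp->dsa_drr and extends it in place for as long as successive frees are contiguous and of the same record type; anything else flushes the buffered record first. The contiguity test is small enough to read in isolation; a hypothetical helper equivalent to the inline check in dump_free():

        /* Sketch only: an equivalent of the inline contiguity test. */
        static boolean_t
        free_is_contiguous(const struct drr_free *drrf, uint64_t object,
            uint64_t offset)
        {
                return (drrf->drr_object == object &&
                    drrf->drr_offset + drrf->drr_length == offset);
        }
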
 140  137  static int
 141  138  dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
 142  139      uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
 143  140  {
 144  141          struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
 145  142  
 146  143  
 147  144          /*
 148  145           * If there is any kind of pending aggregation (currently either
 149  146           * a grouping of free objects or free blocks), push it out to
 150  147           * the stream, since aggregation can't be done across operations
 151  148           * of different types.
 152  149           */
 153  150          if (dsp->dsa_pending_op != PENDING_NONE) {
 154  151                  if (dump_bytes(dsp, dsp->dsa_drr,
 155  152                      sizeof (dmu_replay_record_t)) != 0)
 156  153                          return (EINTR);
 157  154                  dsp->dsa_pending_op = PENDING_NONE;
 158  155          }
 159  156          /* write a DATA record */
 160  157          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 161  158          dsp->dsa_drr->drr_type = DRR_WRITE;
 162  159          drrw->drr_object = object;
 163  160          drrw->drr_type = type;
 164  161          drrw->drr_offset = offset;
 165  162          drrw->drr_length = blksz;
 166  163          drrw->drr_toguid = dsp->dsa_toguid;
 167  164          drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
 168  165          if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
 169  166                  drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
 170  167          DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
 171  168          DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
 172  169          DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
 173  170          drrw->drr_key.ddk_cksum = bp->blk_cksum;
 174  171  
 175  172          if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 176  173                  return (EINTR);
 177  174          if (dump_bytes(dsp, data, blksz) != 0)
 178  175                  return (EINTR);
 179  176          return (0);
 180  177  }
 181  178  
 182  179  static int
 183  180  dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
 184  181  {
 185  182          struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
 186  183  
 187  184          if (dsp->dsa_pending_op != PENDING_NONE) {
 188  185                  if (dump_bytes(dsp, dsp->dsa_drr,
 189  186                      sizeof (dmu_replay_record_t)) != 0)
 190  187                          return (EINTR);
 191  188                  dsp->dsa_pending_op = PENDING_NONE;
 192  189          }
 193  190  
 194  191          /* write a SPILL record */
 195  192          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 196  193          dsp->dsa_drr->drr_type = DRR_SPILL;
 197  194          drrs->drr_object = object;
 198  195          drrs->drr_length = blksz;
 199  196          drrs->drr_toguid = dsp->dsa_toguid;
 200  197  
 201  198          if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
 202  199                  return (EINTR);
 203  200          if (dump_bytes(dsp, data, blksz))
 204  201                  return (EINTR);
 205  202          return (0);
 206  203  }
 207  204  
 208  205  static int
 209  206  dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
 210  207  {
 211  208          struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
 212  209  
 213  210          /*
 214  211           * If there is a pending op, but it's not PENDING_FREEOBJECTS,
 215  212           * push it out, since free block aggregation can only be done for
 216  213           * blocks of the same type (i.e., DRR_FREE records can only be
 217  214           * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
 218  215           * can only be aggregated with other DRR_FREEOBJECTS records.
 219  216           */
 220  217          if (dsp->dsa_pending_op != PENDING_NONE &&
 221  218              dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
 222  219                  if (dump_bytes(dsp, dsp->dsa_drr,
 223  220                      sizeof (dmu_replay_record_t)) != 0)
 224  221                          return (EINTR);
 225  222                  dsp->dsa_pending_op = PENDING_NONE;
 226  223          }
 227  224          if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
 228  225                  /*
 229  226                   * See whether this free object array can be aggregated
 230  227                   * with pending one
 231  228                   */
 232  229                  if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
 233  230                          drrfo->drr_numobjs += numobjs;
 234  231                          return (0);
 235  232                  } else {
 236  233                          /* can't be aggregated.  Push out pending record */
 237  234                          if (dump_bytes(dsp, dsp->dsa_drr,
 238  235                              sizeof (dmu_replay_record_t)) != 0)
 239  236                                  return (EINTR);
 240  237                          dsp->dsa_pending_op = PENDING_NONE;
 241  238                  }
 242  239          }
 243  240  
 244  241          /* write a FREEOBJECTS record */
 245  242          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 246  243          dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
 247  244          drrfo->drr_firstobj = firstobj;
 248  245          drrfo->drr_numobjs = numobjs;
 249  246          drrfo->drr_toguid = dsp->dsa_toguid;
 250  247  
 251  248          dsp->dsa_pending_op = PENDING_FREEOBJECTS;
 252  249  
 253  250          return (0);
 254  251  }
 255  252  
 256  253  static int
 257  254  dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
 258  255  {
 259  256          struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
 260  257  
 261  258          if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
 262  259                  return (dump_freeobjects(dsp, object, 1));
 263  260  
 264  261          if (dsp->dsa_pending_op != PENDING_NONE) {
 265  262                  if (dump_bytes(dsp, dsp->dsa_drr,
 266  263                      sizeof (dmu_replay_record_t)) != 0)
 267  264                          return (EINTR);
 268  265                  dsp->dsa_pending_op = PENDING_NONE;
 269  266          }
 270  267  
 271  268          /* write an OBJECT record */
 272  269          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 273  270          dsp->dsa_drr->drr_type = DRR_OBJECT;
 274  271          drro->drr_object = object;
 275  272          drro->drr_type = dnp->dn_type;
 276  273          drro->drr_bonustype = dnp->dn_bonustype;
 277  274          drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 278  275          drro->drr_bonuslen = dnp->dn_bonuslen;
 279  276          drro->drr_checksumtype = dnp->dn_checksum;
 280  277          drro->drr_compress = dnp->dn_compress;
 281  278          drro->drr_toguid = dsp->dsa_toguid;
 282  279  
 283  280          if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 284  281                  return (EINTR);
 285  282  
 286  283          if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
 287  284                  return (EINTR);
 288  285  
 289  286          /* free anything past the end of the file */
 290  287          if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
 291  288              (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
 292  289                  return (EINTR);
 293  290          if (dsp->dsa_err)
 294  291                  return (EINTR);
 295  292          return (0);
 296  293  }
 297  294  
 298  295  #define BP_SPAN(dnp, level) \
 299  296          (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
 300  297          (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
 301  298  
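BP_SPAN computes how many bytes of logical file data one block pointer covers at a given indirection level. A worked example with assumed values (128K data blocks, so dn_datablkszsec = 256; dn_indblkshift = 14, i.e. 16K indirect blocks; SPA_MINBLOCKSHIFT = 9; SPA_BLKPTRSHIFT = 7, since sizeof (blkptr_t) == 128):

        BP_SPAN(dnp, 0) = 256 << (9 + 0 * (14 - 7)) = 128K
        BP_SPAN(dnp, 1) = 256 << (9 + 1 * 7)        = 16M
        BP_SPAN(dnp, 2) = 256 << (9 + 2 * 7)        = 2G

Each additional level multiplies the span by the 128 block pointers that fit in one 16K indirect block.
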
 302  299  /* ARGSUSED */
 303  300  static int
 304  301  backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
 305  302      const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 306  303  {
 307  304          dmu_sendarg_t *dsp = arg;
 308  305          dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
 309  306          int err = 0;
 310  307  
 311  308          if (issig(JUSTLOOKING) && issig(FORREAL))
 312  309                  return (EINTR);
 313  310  
 314  311          if (zb->zb_object != DMU_META_DNODE_OBJECT &&
 315  312              DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
 316  313                  return (0);
 317  314          } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
 318  315                  uint64_t span = BP_SPAN(dnp, zb->zb_level);
 319  316                  uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
 320  317                  err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
 321  318          } else if (bp == NULL) {
 322  319                  uint64_t span = BP_SPAN(dnp, zb->zb_level);
 323  320                  err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
 324  321          } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
 325  322                  return (0);
 326  323          } else if (type == DMU_OT_DNODE) {
 327  324                  dnode_phys_t *blk;
 328  325                  int i;
 329  326                  int blksz = BP_GET_LSIZE(bp);
 330  327                  uint32_t aflags = ARC_WAIT;
 331  328                  arc_buf_t *abuf;
 332  329  
 333  330                  if (dsl_read(NULL, spa, bp, pbuf,
 334  331                      arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
 335  332                      ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
 336  333                          return (EIO);
 337  334  
 338  335                  blk = abuf->b_data;
 339  336                  for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
 340  337                          uint64_t dnobj = (zb->zb_blkid <<
 341  338                              (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
 342  339                          err = dump_dnode(dsp, dnobj, blk+i);
 343  340                          if (err)
 344  341                                  break;
 345  342                  }
 346  343                  (void) arc_buf_remove_ref(abuf, &abuf);
 347  344          } else if (type == DMU_OT_SA) {
 348  345                  uint32_t aflags = ARC_WAIT;
 349  346                  arc_buf_t *abuf;
 350  347                  int blksz = BP_GET_LSIZE(bp);
 351  348  
 352  349                  if (arc_read_nolock(NULL, spa, bp,
 353  350                      arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
 354  351                      ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
 355  352                          return (EIO);
 356  353  
 357  354                  err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
 358  355                  (void) arc_buf_remove_ref(abuf, &abuf);
 359  356          } else { /* it's a level-0 block of a regular object */
 360  357                  uint32_t aflags = ARC_WAIT;
 361  358                  arc_buf_t *abuf;
 362  359                  int blksz = BP_GET_LSIZE(bp);
 363  360  
 364  361                  if (dsl_read(NULL, spa, bp, pbuf,
 365  362                      arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
 366  363                      ZIO_FLAG_CANFAIL, &aflags, zb) != 0) {
 367  364                          if (zfs_send_corrupt_data) {
 368  365                                  /* Send a block filled with 0x"zfs badd bloc" */
 369  366                                  abuf = arc_buf_alloc(spa, blksz, &abuf,
 370  367                                      ARC_BUFC_DATA);
 371  368                                  uint64_t *ptr;
 372  369                                  for (ptr = abuf->b_data;
 373  370                                      (char *)ptr < (char *)abuf->b_data + blksz;
 374  371                                      ptr++)
 375  372                                          *ptr = 0x2f5baddb10c;
 376  373                          } else {
 377  374                                  return (EIO);
 378  375                          }
 379  376                  }
 380  377  
 381  378                  err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
 382  379                      blksz, bp, abuf->b_data);
 383  380                  (void) arc_buf_remove_ref(abuf, &abuf);
 384  381          }
 385  382  
 386  383          ASSERT(err == 0 || err == EINTR);
 387  384          return (err);
 388  385  }
 389  386  
 390  387  int
 391  388  dmu_send(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
 392  389      int outfd, vnode_t *vp, offset_t *off)
 393  390  {
 394  391          dsl_dataset_t *ds = tosnap->os_dsl_dataset;
 395  392          dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
 396  393          dmu_replay_record_t *drr;
 397  394          dmu_sendarg_t *dsp;
 398  395          int err;
 399  396          uint64_t fromtxg = 0;
 400  397  
 401  398          /* tosnap must be a snapshot */
 402  399          if (ds->ds_phys->ds_next_snap_obj == 0)
 403  400                  return (EINVAL);
 404  401  
 405  402          /* fromsnap must be an earlier snapshot from the same fs as tosnap */
 406  403          if (fromds && (ds->ds_dir != fromds->ds_dir ||
 407  404              fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
 408  405                  return (EXDEV);
 409  406  
 410  407          if (fromorigin) {
 411  408                  dsl_pool_t *dp = ds->ds_dir->dd_pool;
 412  409  
 413  410                  if (fromsnap)
 414  411                          return (EINVAL);
 415  412  
 416  413                  if (dsl_dir_is_clone(ds->ds_dir)) {
 417  414                          rw_enter(&dp->dp_config_rwlock, RW_READER);
 418  415                          err = dsl_dataset_hold_obj(dp,
 419  416                              ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
 420  417                          rw_exit(&dp->dp_config_rwlock);
 421  418                          if (err)
 422  419                                  return (err);
 423  420                  } else {
 424  421                          fromorigin = B_FALSE;
 425  422                  }
 426  423          }
 427  424  
 428  425  
 429  426          drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
 430  427          drr->drr_type = DRR_BEGIN;
 431  428          drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
 432  429          DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
 433  430              DMU_SUBSTREAM);
 434  431  
 435  432  #ifdef _KERNEL
 436  433          if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
 437  434                  uint64_t version;
 438  435                  if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) {
 439  436                          kmem_free(drr, sizeof (dmu_replay_record_t));
 440  437                          return (EINVAL);
 441  438                  }
 442  439                  if (version == ZPL_VERSION_SA) {
 443  440                          DMU_SET_FEATUREFLAGS(
 444  441                              drr->drr_u.drr_begin.drr_versioninfo,
 445  442                              DMU_BACKUP_FEATURE_SA_SPILL);
 446  443                  }
 447  444          }
 448  445  #endif
 449  446  
 450  447          drr->drr_u.drr_begin.drr_creation_time =
 451  448              ds->ds_phys->ds_creation_time;
 452  449          drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
 453  450          if (fromorigin)
 454  451                  drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
 455  452          drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
 456  453          if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 457  454                  drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
 458  455  
 459  456          if (fromds)
 460  457                  drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
 461  458          dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
 462  459  
 463  460          if (fromds)
 464  461                  fromtxg = fromds->ds_phys->ds_creation_txg;
 465  462          if (fromorigin)
 466  463                  dsl_dataset_rele(fromds, FTAG);
 467  464  
 468  465          dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
 469  466  
 470  467          dsp->dsa_drr = drr;
 471  468          dsp->dsa_vp = vp;
 472  469          dsp->dsa_outfd = outfd;
 473  470          dsp->dsa_proc = curproc;
 474  471          dsp->dsa_os = tosnap;
 475  472          dsp->dsa_off = off;
 476  473          dsp->dsa_toguid = ds->ds_phys->ds_guid;
 477  474          ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
 478  475          dsp->dsa_pending_op = PENDING_NONE;
 479  476  
 480  477          mutex_enter(&ds->ds_sendstream_lock);
 481  478          list_insert_head(&ds->ds_sendstreams, dsp);
 482  479          mutex_exit(&ds->ds_sendstream_lock);
 483  480  
 484  481          if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
 485  482                  err = dsp->dsa_err;
 486  483                  goto out;
 487  484          }
 488  485  
 489  486          err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
 490  487              backup_cb, dsp);
 491  488  
 492  489          if (dsp->dsa_pending_op != PENDING_NONE)
 493  490                  if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
 494  491                          err = EINTR;
 495  492  
 496  493          if (err) {
 497  494                  if (err == EINTR && dsp->dsa_err)
 498  495                          err = dsp->dsa_err;
 499  496                  goto out;
 500  497          }
 501  498  
 502  499          bzero(drr, sizeof (dmu_replay_record_t));
 503  500          drr->drr_type = DRR_END;
 504  501          drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
 505  502          drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
 506  503  
 507  504          if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
 508  505                  err = dsp->dsa_err;
 509  506                  goto out;
 510  507          }
 511  508  
 512  509  out:
 513  510          mutex_enter(&ds->ds_sendstream_lock);
 514  511          list_remove(&ds->ds_sendstreams, dsp);
 515  512          mutex_exit(&ds->ds_sendstream_lock);
 516  513  
 517  514          kmem_free(drr, sizeof (dmu_replay_record_t));
 518  515          kmem_free(dsp, sizeof (dmu_sendarg_t));
 519  516  
 520  517          return (err);
 521  518  }
 522  519  
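Taken together, dmu_send() frames the stream as one BEGIN record, a body of records emitted by backup_cb() as the dataset is traversed, and an END record carrying the fletcher-4 checksum that dump_bytes() accumulates over everything sent:

        DRR_BEGIN
        (DRR_OBJECT | DRR_WRITE | DRR_SPILL | DRR_FREE | DRR_FREEOBJECTS)*
        DRR_END    /* drr_checksum covers all preceding bytes */
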
 523  520  int
 524  521  dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin,
 525  522      uint64_t *sizep)
 526  523  {
 527  524          dsl_dataset_t *ds = tosnap->os_dsl_dataset;
 528  525          dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
 529  526          dsl_pool_t *dp = ds->ds_dir->dd_pool;
 530  527          int err;
 531  528          uint64_t size;
 532  529  
 533  530          /* tosnap must be a snapshot */
 534  531          if (ds->ds_phys->ds_next_snap_obj == 0)
 535  532                  return (EINVAL);
 536  533  
 537  534          /* fromsnap must be an earlier snapshot from the same fs as tosnap */
 538  535          if (fromds && (ds->ds_dir != fromds->ds_dir ||
 539  536              fromds->ds_phys->ds_creation_txg >= ds->ds_phys->ds_creation_txg))
 540  537                  return (EXDEV);
 541  538  
 542  539          if (fromorigin) {
 543  540                  if (fromsnap)
 544  541                          return (EINVAL);
 545  542  
 546  543                  if (dsl_dir_is_clone(ds->ds_dir)) {
 547  544                          rw_enter(&dp->dp_config_rwlock, RW_READER);
 548  545                          err = dsl_dataset_hold_obj(dp,
 549  546                              ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &fromds);
 550  547                          rw_exit(&dp->dp_config_rwlock);
 551  548                          if (err)
 552  549                                  return (err);
 553  550                  } else {
 554  551                          fromorigin = B_FALSE;
 555  552                  }
 556  553          }
 557  554  
 558  555          /* Get uncompressed size estimate of changed data. */
 559  556          if (fromds == NULL) {
 560  557                  size = ds->ds_phys->ds_uncompressed_bytes;
 561  558          } else {
 562  559                  uint64_t used, comp;
 563  560                  err = dsl_dataset_space_written(fromds, ds,
 564  561                      &used, &comp, &size);
 565  562                  if (fromorigin)
 566  563                          dsl_dataset_rele(fromds, FTAG);
 567  564                  if (err)
 568  565                          return (err);
 569  566          }
 570  567  
 571  568          /*
 572  569           * Assume that space (both on-disk and in-stream) is dominated by
 573  570           * data.  We will adjust for indirect blocks and the copies property,
 574  571           * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
 575  572           */
 576  573  
 577  574          /*
 578  575           * Subtract out approximate space used by indirect blocks.
 579  576           * Assume most space is used by data blocks (non-indirect, non-dnode).
 580  577           * Assume all blocks are recordsize.  Assume ditto blocks and
 581  578           * internal fragmentation counter out compression.
 582  579           *
 583  580           * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
 584  581           * block, which we observe in practice.
 585  582           */
 586  583          uint64_t recordsize;
 587  584          rw_enter(&dp->dp_config_rwlock, RW_READER);
 588  585          err = dsl_prop_get_ds(ds, "recordsize",
 589  586              sizeof (recordsize), 1, &recordsize, NULL);
 590  587          rw_exit(&dp->dp_config_rwlock);
 591  588          if (err)
 592  589                  return (err);
 593  590          size -= size / recordsize * sizeof (blkptr_t);
 594  591  
 595  592          /* Add in the space for the record associated with each block. */
 596  593          size += size / recordsize * sizeof (dmu_replay_record_t);
 597  594  
 598  595          *sizep = size;
 599  596  
 600  597          return (0);
 601  598  }
 602  599  
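The closing arithmetic can be sanity-checked with assumed figures, say 1G of changed data at the default 128K recordsize:

        blocks = 1G / 128K = 8192
        size  -= 8192 * sizeof (blkptr_t)    /* 8192 * 128 = 1M */
        size  += (size / 128K) * sizeof (dmu_replay_record_t)

Both adjustments are on the order of 0.1% of the data at a 128K recordsize, so for large-block datasets the estimate tracks ds_uncompressed_bytes closely; the relative overhead grows as recordsize shrinks.
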
 603  600  struct recvbeginsyncarg {
 604  601          const char *tofs;
 605  602          const char *tosnap;
 606  603          dsl_dataset_t *origin;
 607  604          uint64_t fromguid;
 608  605          dmu_objset_type_t type;
 609  606          void *tag;
 610  607          boolean_t force;
 611  608          uint64_t dsflags;
 612  609          char clonelastname[MAXNAMELEN];
 613  610          dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
 614  611          cred_t *cr;
 615  612  };
 616  613  
 617  614  /* ARGSUSED */
 618  615  static int
 619  616  recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
 620  617  {
 621  618          dsl_dir_t *dd = arg1;
 622  619          struct recvbeginsyncarg *rbsa = arg2;
 623  620          objset_t *mos = dd->dd_pool->dp_meta_objset;
 624  621          uint64_t val;
 625  622          int err;
 626  623  
 627  624          err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
 628  625              strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);
 629  626  
 630  627          if (err != ENOENT)
 631  628                  return (err ? err : EEXIST);
 632  629  
 633  630          if (rbsa->origin) {
 634  631                  /* make sure it's a snap in the same pool */
 635  632                  if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
 636  633                          return (EXDEV);
 637  634                  if (!dsl_dataset_is_snapshot(rbsa->origin))
 638  635                          return (EINVAL);
 639  636                  if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
 640  637                          return (ENODEV);
 641  638          }
 642  639  
 643  640          return (0);
 644  641  }
 645  642  
 646  643  static void
 647  644  recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 648  645  {
 649  646          dsl_dir_t *dd = arg1;
 650  647          struct recvbeginsyncarg *rbsa = arg2;
 651  648          uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
 652  649          uint64_t dsobj;
 653  650  
 654  651          /* Create and open new dataset. */
 655  652          dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
 656  653              rbsa->origin, flags, rbsa->cr, tx);
 657  654          VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
 658  655              B_TRUE, dmu_recv_tag, &rbsa->ds));
 659  656  
 660  657          if (rbsa->origin == NULL) {
 661  658                  (void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
 662  659                      rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
 663  660          }
 664  661  
 665  662          spa_history_log_internal(LOG_DS_REPLAY_FULL_SYNC,
 666  663              dd->dd_pool->dp_spa, tx, "dataset = %lld", dsobj);
 667  664  }
 668  665  
 669  666  /* ARGSUSED */
 670  667  static int
 671  668  recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
 672  669  {
 673  670          dsl_dataset_t *ds = arg1;
 674  671          struct recvbeginsyncarg *rbsa = arg2;
 675  672          int err;
 676  673          uint64_t val;
 677  674  
 678  675          /* must not have any changes since most recent snapshot */
 679  676          if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
 680  677                  return (ETXTBSY);
 681  678  
 682  679          /* new snapshot name must not exist */
 683  680          err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
 684  681              ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
 685  682          if (err == 0)
 686  683                  return (EEXIST);
 687  684          if (err != ENOENT)
 688  685                  return (err);
 689  686  
 690  687          if (rbsa->fromguid) {
 691  688                  /* if incremental, most recent snapshot must match fromguid */
 692  689                  if (ds->ds_prev == NULL)
 693  690                          return (ENODEV);
 694  691  
 695  692                  /*
 696  693                   * most recent snapshot must match fromguid, or there are no
 697  694                   * changes since the fromguid one
 698  695                   */
 699  696                  if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
 700  697                          uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
 701  698                          uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
 702  699                          while (obj != 0) {
 703  700                                  dsl_dataset_t *snap;
 704  701                                  err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
 705  702                                      obj, FTAG, &snap);
 706  703                                  if (err)
 707  704                                          return (ENODEV);
 708  705                                  if (snap->ds_phys->ds_creation_txg < birth) {
 709  706                                          dsl_dataset_rele(snap, FTAG);
 710  707                                          return (ENODEV);
 711  708                                  }
 712  709                                  if (snap->ds_phys->ds_guid == rbsa->fromguid) {
 713  710                                          dsl_dataset_rele(snap, FTAG);
 714  711                                          break; /* it's ok */
 715  712                                  }
 716  713                                  obj = snap->ds_phys->ds_prev_snap_obj;
 717  714                                  dsl_dataset_rele(snap, FTAG);
 718  715                          }
 719  716                          if (obj == 0)
 720  717                                  return (ENODEV);
 721  718                  }
 722  719          } else {
 723  720                  /* if full, most recent snapshot must be $ORIGIN */
 724  721                  if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
 725  722                          return (ENODEV);
 726  723          }
 727  724  
 728  725          /* temporary clone name must not exist */
 729  726          err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
 730  727              ds->ds_dir->dd_phys->dd_child_dir_zapobj,
 731  728              rbsa->clonelastname, 8, 1, &val);
 732  729          if (err == 0)
 733  730                  return (EEXIST);
 734  731          if (err != ENOENT)
 735  732                  return (err);
 736  733  
 737  734          return (0);
 738  735  }
 739  736  
 740  737  /* ARGSUSED */
 741  738  static void
 742  739  recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
 743  740  {
 744  741          dsl_dataset_t *ohds = arg1;
 745  742          struct recvbeginsyncarg *rbsa = arg2;
 746  743          dsl_pool_t *dp = ohds->ds_dir->dd_pool;
 747  744          dsl_dataset_t *cds;
 748  745          uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
 749  746          uint64_t dsobj;
 750  747  
 751  748          /* create and open the temporary clone */
 752  749          dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
 753  750              ohds->ds_prev, flags, rbsa->cr, tx);
 754  751          VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));
 755  752  
 756  753          /*
 757  754           * If we actually created a non-clone, we need to create the
 758  755           * objset in our new dataset.
 759  756           */
 760  757          if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
 761  758                  (void) dmu_objset_create_impl(dp->dp_spa,
 762  759                      cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
 763  760          }
 764  761  
 765  762          rbsa->ds = cds;
 766  763  
 767  764          spa_history_log_internal(LOG_DS_REPLAY_INC_SYNC,
 768  765              dp->dp_spa, tx, "dataset = %lld", dsobj);
 769  766  }
 770  767  
 771  768  static boolean_t
 772  769  dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
 773  770  {
 774  771          int featureflags;
 775  772  
 776  773          featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 777  774  
 778  775          /* Verify pool version supports SA if SA_SPILL feature set */
 779  776          return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
 780  777              (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
 781  778  }
 782  779  
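Note the inverted sense here: dmu_recv_verify_features() returns B_TRUE when verification fails, i.e. when the stream carries DMU_BACKUP_FEATURE_SA_SPILL but the receiving pool predates SPA_VERSION_SA. Callers accordingly map a TRUE result to ENOTSUP, as dmu_recv_begin() does below:

        if (dmu_recv_verify_features(ds, drrb)) {
                dsl_dataset_rele(ds, dmu_recv_tag);
                return (ENOTSUP);
        }
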
 783  780  /*
 784  781   * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 785  782   * succeeds; otherwise we will leak the holds on the datasets.
 786  783   */
 787  784  int
 788  785  dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
 789  786      boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
 790  787  {
 791  788          int err = 0;
 792  789          boolean_t byteswap;
 793  790          struct recvbeginsyncarg rbsa = { 0 };
 794  791          uint64_t versioninfo;
 795  792          int flags;
 796  793          dsl_dataset_t *ds;
 797  794  
 798  795          if (drrb->drr_magic == DMU_BACKUP_MAGIC)
 799  796                  byteswap = FALSE;
 800  797          else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
 801  798                  byteswap = TRUE;
 802  799          else
 803  800                  return (EINVAL);
 804  801  
 805  802          rbsa.tofs = tofs;
 806  803          rbsa.tosnap = tosnap;
 807  804          rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
 808  805          rbsa.fromguid = drrb->drr_fromguid;
 809  806          rbsa.type = drrb->drr_type;
 810  807          rbsa.tag = FTAG;
 811  808          rbsa.dsflags = 0;
 812  809          rbsa.cr = CRED();
 813  810          versioninfo = drrb->drr_versioninfo;
 814  811          flags = drrb->drr_flags;
 815  812  
 816  813          if (byteswap) {
 817  814                  rbsa.type = BSWAP_32(rbsa.type);
 818  815                  rbsa.fromguid = BSWAP_64(rbsa.fromguid);
 819  816                  versioninfo = BSWAP_64(versioninfo);
 820  817                  flags = BSWAP_32(flags);
 821  818          }
 822  819  
 823  820          if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
 824  821              rbsa.type >= DMU_OST_NUMTYPES ||
 825  822              ((flags & DRR_FLAG_CLONE) && origin == NULL))
 826  823                  return (EINVAL);
 827  824  
 828  825          if (flags & DRR_FLAG_CI_DATA)
 829  826                  rbsa.dsflags = DS_FLAG_CI_DATASET;
 830  827  
 831  828          bzero(drc, sizeof (dmu_recv_cookie_t));
 832  829          drc->drc_drrb = drrb;
 833  830          drc->drc_tosnap = tosnap;
 834  831          drc->drc_top_ds = top_ds;
 835  832          drc->drc_force = force;
 836  833  
 837  834          /*
 838  835           * Process the begin in syncing context.
 839  836           */
 840  837  
 841  838          /* open the dataset we are logically receiving into */
 842  839          err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
 843  840          if (err == 0) {
 844  841                  if (dmu_recv_verify_features(ds, drrb)) {
 845  842                          dsl_dataset_rele(ds, dmu_recv_tag);
 846  843                          return (ENOTSUP);
 847  844                  }
 848  845                  /* target fs already exists; recv into temp clone */
 849  846  
 850  847                  /* Can't recv a clone into an existing fs */
 851  848                  if (flags & DRR_FLAG_CLONE) {
 852  849                          dsl_dataset_rele(ds, dmu_recv_tag);
 853  850                          return (EINVAL);
 854  851                  }
 855  852  
 856  853                  /* must not have an incremental recv already in progress */
 857  854                  if (!mutex_tryenter(&ds->ds_recvlock)) {
 858  855                          dsl_dataset_rele(ds, dmu_recv_tag);
 859  856                          return (EBUSY);
 860  857                  }
 861  858  
 862  859                  /* tmp clone name is: tofs/%tosnap" */
 863  860                  (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
 864  861                      "%%%s", tosnap);
 865  862                  rbsa.force = force;
 866  863                  err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 867  864                      recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
 868  865                  if (err) {
 869  866                          mutex_exit(&ds->ds_recvlock);
 870  867                          dsl_dataset_rele(ds, dmu_recv_tag);
 871  868                          return (err);
 872  869                  }
 873  870                  drc->drc_logical_ds = ds;
 874  871                  drc->drc_real_ds = rbsa.ds;
 875  872          } else if (err == ENOENT) {
 876  873                  /* target fs does not exist; must be a full backup or clone */
 877  874                  char *cp;
 878  875  
 879  876                  /*
 880  877                   * If it's a non-clone incremental, we are missing the
 881  878                   * target fs, so fail the recv.
 882  879                   */
 883  880                  if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
 884  881                          return (ENOENT);
 885  882  
 886  883                  /* Open the parent of tofs */
 887  884                  cp = strrchr(tofs, '/');
 888  885                  *cp = '\0';
 889  886                  err = dsl_dataset_hold(tofs, FTAG, &ds);
 890  887                  *cp = '/';
 891  888                  if (err)
 892  889                          return (err);
 893  890  
 894  891                  if (dmu_recv_verify_features(ds, drrb)) {
 895  892                          dsl_dataset_rele(ds, FTAG);
 896  893                          return (ENOTSUP);
 897  894                  }
 898  895  
 899  896                  err = dsl_sync_task_do(ds->ds_dir->dd_pool,
 900  897                      recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
 901  898                  dsl_dataset_rele(ds, FTAG);
 902  899                  if (err)
 903  900                          return (err);
 904  901                  drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
 905  902                  drc->drc_newfs = B_TRUE;
 906  903          }
 907  904  
 908  905          return (err);
 909  906  }
 910  907  
 911  908  struct restorearg {
 912  909          int err;
 913  910          int byteswap;
 914  911          vnode_t *vp;
 915  912          char *buf;
 916  913          uint64_t voff;
 917  914          int bufsize; /* amount of memory allocated for buf */
 918  915          zio_cksum_t cksum;
 919  916          avl_tree_t *guid_to_ds_map;
 920  917  };
 921  918  
 922  919  typedef struct guid_map_entry {
 923  920          uint64_t        guid;
 924  921          dsl_dataset_t   *gme_ds;
 925  922          avl_node_t      avlnode;
 926  923  } guid_map_entry_t;
 927  924  
 928  925  static int
 929  926  guid_compare(const void *arg1, const void *arg2)
 930  927  {
 931  928          const guid_map_entry_t *gmep1 = arg1;
 932  929          const guid_map_entry_t *gmep2 = arg2;
 933  930  
 934  931          if (gmep1->guid < gmep2->guid)
 935  932                  return (-1);
 936  933          else if (gmep1->guid > gmep2->guid)
 937  934                  return (1);
 938  935          return (0);
 939  936  }
 940  937  
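guid_compare() orders the guid_to_ds_map AVL tree that dedup'ed receives use to resolve DRR_WRITE_BYREF records back to already-received datasets. Its wiring follows the usual illumos AVL pattern (a sketch; the tree itself is created later in this file, outside this hunk):

        avl_tree_t *map = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
        avl_create(map, guid_compare, sizeof (guid_map_entry_t),
            offsetof(guid_map_entry_t, avlnode));
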
 941  938  static void
 942  939  free_guid_map_onexit(void *arg)
 943  940  {
 944  941          avl_tree_t *ca = arg;
 945  942          void *cookie = NULL;
 946  943          guid_map_entry_t *gmep;
 947  944  
 948  945          while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
 949  946                  dsl_dataset_rele(gmep->gme_ds, ca);
 950  947                  kmem_free(gmep, sizeof (guid_map_entry_t));
 951  948          }
 952  949          avl_destroy(ca);
 953  950          kmem_free(ca, sizeof (avl_tree_t));
 954  951  }
 955  952  
 956  953  static void *
 957  954  restore_read(struct restorearg *ra, int len)
 958  955  {
 959  956          void *rv;
 960  957          int done = 0;
 961  958  
 962  959          /* some things will require 8-byte alignment, so everything must */
 963  960          ASSERT3U(len % 8, ==, 0);
 964  961  
 965  962          while (done < len) {
 966  963                  ssize_t resid;
 967  964  
 968  965                  ra->err = vn_rdwr(UIO_READ, ra->vp,
 969  966                      (caddr_t)ra->buf + done, len - done,
 970  967                      ra->voff, UIO_SYSSPACE, FAPPEND,
 971  968                      RLIM64_INFINITY, CRED(), &resid);
 972  969  
 973  970                  if (resid == len - done)
 974  971                          ra->err = EINVAL;
 975  972                  ra->voff += len - done - resid;
 976  973                  done = len - resid;
 977  974                  if (ra->err)
 978  975                          return (NULL);
 979  976          }
 980  977  
 981  978          ASSERT3U(done, ==, len);
 982  979          rv = ra->buf;
 983  980          if (ra->byteswap)
 984  981                  fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
 985  982          else
 986  983                  fletcher_4_incremental_native(rv, len, &ra->cksum);
 987  984          return (rv);
 988  985  }
 989  986  
 990  987  static void
 991  988  backup_byteswap(dmu_replay_record_t *drr)
 992  989  {
 993  990  #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
 994  991  #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
 995  992          drr->drr_type = BSWAP_32(drr->drr_type);
 996  993          drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
 997  994          switch (drr->drr_type) {
 998  995          case DRR_BEGIN:
 999  996                  DO64(drr_begin.drr_magic);
1000  997                  DO64(drr_begin.drr_versioninfo);
1001  998                  DO64(drr_begin.drr_creation_time);
1002  999                  DO32(drr_begin.drr_type);
1003 1000                  DO32(drr_begin.drr_flags);
1004 1001                  DO64(drr_begin.drr_toguid);
1005 1002                  DO64(drr_begin.drr_fromguid);
1006 1003                  break;
1007 1004          case DRR_OBJECT:
1008 1005                  DO64(drr_object.drr_object);
1009 1006                  /* DO64(drr_object.drr_allocation_txg); */
1010 1007                  DO32(drr_object.drr_type);
1011 1008                  DO32(drr_object.drr_bonustype);
1012 1009                  DO32(drr_object.drr_blksz);
1013 1010                  DO32(drr_object.drr_bonuslen);
1014 1011                  DO64(drr_object.drr_toguid);
1015 1012                  break;
1016 1013          case DRR_FREEOBJECTS:
1017 1014                  DO64(drr_freeobjects.drr_firstobj);
1018 1015                  DO64(drr_freeobjects.drr_numobjs);
1019 1016                  DO64(drr_freeobjects.drr_toguid);
1020 1017                  break;
1021 1018          case DRR_WRITE:
1022 1019                  DO64(drr_write.drr_object);
1023 1020                  DO32(drr_write.drr_type);
1024 1021                  DO64(drr_write.drr_offset);
1025 1022                  DO64(drr_write.drr_length);
1026 1023                  DO64(drr_write.drr_toguid);
1027 1024                  DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
1028 1025                  DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
1029 1026                  DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
1030 1027                  DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
1031 1028                  DO64(drr_write.drr_key.ddk_prop);
1032 1029                  break;
1033 1030          case DRR_WRITE_BYREF:
1034 1031                  DO64(drr_write_byref.drr_object);
1035 1032                  DO64(drr_write_byref.drr_offset);
1036 1033                  DO64(drr_write_byref.drr_length);
1037 1034                  DO64(drr_write_byref.drr_toguid);
1038 1035                  DO64(drr_write_byref.drr_refguid);
1039 1036                  DO64(drr_write_byref.drr_refobject);
1040 1037                  DO64(drr_write_byref.drr_refoffset);
1041 1038                  DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
1042 1039                  DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
1043 1040                  DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
1044 1041                  DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
1045 1042                  DO64(drr_write_byref.drr_key.ddk_prop);
1046 1043                  break;
1047 1044          case DRR_FREE:
1048 1045                  DO64(drr_free.drr_object);
1049 1046                  DO64(drr_free.drr_offset);
1050 1047                  DO64(drr_free.drr_length);
1051 1048                  DO64(drr_free.drr_toguid);
1052 1049                  break;
1053 1050          case DRR_SPILL:
1054 1051                  DO64(drr_spill.drr_object);
1055 1052                  DO64(drr_spill.drr_length);
1056 1053                  DO64(drr_spill.drr_toguid);
1057 1054                  break;
1058 1055          case DRR_END:
1059 1056                  DO64(drr_end.drr_checksum.zc_word[0]);
1060 1057                  DO64(drr_end.drr_checksum.zc_word[1]);
1061 1058                  DO64(drr_end.drr_checksum.zc_word[2]);
1062 1059                  DO64(drr_end.drr_checksum.zc_word[3]);
1063 1060                  DO64(drr_end.drr_toguid);
1064 1061                  break;
1065 1062          }
1066 1063  #undef DO64
1067 1064  #undef DO32
  
  
1068 1065  }
1069 1066  
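The DO64/DO32 macros above byte-swap one field of the record union in place; DO64(drr_begin.drr_magic), for instance, expands to:

        drr->drr_u.drr_begin.drr_magic =
            BSWAP_64(drr->drr_u.drr_begin.drr_magic);
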
1070 1067  static int
1071 1068  restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
1072 1069  {
1073 1070          int err;
1074 1071          dmu_tx_t *tx;
1075 1072          void *data = NULL;
1076 1073  
1077 1074          if (drro->drr_type == DMU_OT_NONE ||
1078      -            drro->drr_type >= DMU_OT_NUMTYPES ||
1079      -            drro->drr_bonustype >= DMU_OT_NUMTYPES ||
     1075 +            !DMU_OT_IS_VALID(drro->drr_type) ||
     1076 +            !DMU_OT_IS_VALID(drro->drr_bonustype) ||
1080 1077              drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
1081 1078              drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
1082 1079              P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
1083 1080              drro->drr_blksz < SPA_MINBLOCKSIZE ||
1084 1081              drro->drr_blksz > SPA_MAXBLOCKSIZE ||
1085 1082              drro->drr_bonuslen > DN_MAX_BONUSLEN) {
1086 1083                  return (EINVAL);
1087 1084          }
1088 1085  
1089 1086          err = dmu_object_info(os, drro->drr_object, NULL);
1090 1087  
1091 1088          if (err != 0 && err != ENOENT)
1092 1089                  return (EINVAL);
1093 1090  
1094 1091          if (drro->drr_bonuslen) {
1095 1092                  data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
1096 1093                  if (ra->err)
1097 1094                          return (ra->err);
1098 1095          }
1099 1096  
1100 1097          if (err == ENOENT) {
1101 1098                  /* currently free, want to be allocated */
1102 1099                  tx = dmu_tx_create(os);
1103 1100                  dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1104 1101                  err = dmu_tx_assign(tx, TXG_WAIT);
1105 1102                  if (err) {
1106 1103                          dmu_tx_abort(tx);
1107 1104                          return (err);
1108 1105                  }
1109 1106                  err = dmu_object_claim(os, drro->drr_object,
1110 1107                      drro->drr_type, drro->drr_blksz,
1111 1108                      drro->drr_bonustype, drro->drr_bonuslen, tx);
1112 1109                  dmu_tx_commit(tx);
1113 1110          } else {
1114 1111                  /* currently allocated, want to be allocated */
1115 1112                  err = dmu_object_reclaim(os, drro->drr_object,
1116 1113                      drro->drr_type, drro->drr_blksz,
1117 1114                      drro->drr_bonustype, drro->drr_bonuslen);
1118 1115          }
1119 1116          if (err) {
1120 1117                  return (EINVAL);
1121 1118          }
1122 1119  
1123 1120          tx = dmu_tx_create(os);
1124 1121          dmu_tx_hold_bonus(tx, drro->drr_object);
1125 1122          err = dmu_tx_assign(tx, TXG_WAIT);
1126 1123          if (err) {
1127 1124                  dmu_tx_abort(tx);
1128 1125                  return (err);
1129 1126          }
1130 1127  
1131 1128          dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
1132 1129              tx);
1133 1130          dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);
  
  
1134 1131  
1135 1132          if (data != NULL) {
1136 1133                  dmu_buf_t *db;
1137 1134  
1138 1135                  VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
1139 1136                  dmu_buf_will_dirty(db, tx);
1140 1137  
1141 1138                  ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
1142 1139                  bcopy(data, db->db_data, drro->drr_bonuslen);
1143 1140                  if (ra->byteswap) {
1144      -                        dmu_ot[drro->drr_bonustype].ot_byteswap(db->db_data,
     1141 +                        dmu_object_byteswap_t byteswap =
     1142 +                            DMU_OT_BYTESWAP(drro->drr_bonustype);
     1143 +                        dmu_ot_byteswap[byteswap].ob_func(db->db_data,
1145 1144                              drro->drr_bonuslen);
1146 1145                  }
1147 1146                  dmu_buf_rele(db, FTAG);
1148 1147          }
1149 1148          dmu_tx_commit(tx);
1150 1149          return (0);
1151 1150  }
1152 1151  
1153 1152  /* ARGSUSED */
1154 1153  static int
1155 1154  restore_freeobjects(struct restorearg *ra, objset_t *os,
1156 1155      struct drr_freeobjects *drrfo)
1157 1156  {
1158 1157          uint64_t obj;
1159 1158  
1160 1159          if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
1161 1160                  return (EINVAL);
1162 1161  
1163 1162          for (obj = drrfo->drr_firstobj;
1164 1163              obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
1165 1164              (void) dmu_object_next(os, &obj, FALSE, 0)) {
1166 1165                  int err;
1167 1166  
1168 1167                  if (dmu_object_info(os, obj, NULL) != 0)
1169 1168                          continue;
1170 1169  
1171 1170                  err = dmu_free_object(os, obj);
1172 1171                  if (err)
1173 1172                          return (err);
1174 1173          }
1175 1174          return (0);
1176 1175  }
  
  
1177 1176  
1178 1177  static int
1179 1178  restore_write(struct restorearg *ra, objset_t *os,
1180 1179      struct drr_write *drrw)
1181 1180  {
1182 1181          dmu_tx_t *tx;
1183 1182          void *data;
1184 1183          int err;
1185 1184  
1186 1185          if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
1187      -            drrw->drr_type >= DMU_OT_NUMTYPES)
     1186 +            !DMU_OT_IS_VALID(drrw->drr_type))
1188 1187                  return (EINVAL);
1189 1188  
1190 1189          data = restore_read(ra, drrw->drr_length);
1191 1190          if (data == NULL)
1192 1191                  return (ra->err);
1193 1192  
1194 1193          if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
1195 1194                  return (EINVAL);
1196 1195  
1197 1196          tx = dmu_tx_create(os);
1198 1197  
1199 1198          dmu_tx_hold_write(tx, drrw->drr_object,
1200 1199              drrw->drr_offset, drrw->drr_length);
1201 1200          err = dmu_tx_assign(tx, TXG_WAIT);
1202 1201          if (err) {
1203 1202                  dmu_tx_abort(tx);
1204 1203                  return (err);
1205 1204          }
1206      -        if (ra->byteswap)
1207      -                dmu_ot[drrw->drr_type].ot_byteswap(data, drrw->drr_length);
     1205 +        if (ra->byteswap) {
     1206 +                dmu_object_byteswap_t byteswap =
     1207 +                    DMU_OT_BYTESWAP(drrw->drr_type);
     1208 +                dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
     1209 +        }
1208 1210          dmu_write(os, drrw->drr_object,
1209 1211              drrw->drr_offset, drrw->drr_length, data, tx);
1210 1212          dmu_tx_commit(tx);
1211 1213          return (0);
1212 1214  }
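
Every restore_* handler above follows the same DMU transaction discipline: create the tx, declare the holds it will need, assign it to a txg with TXG_WAIT, then either abort on failure or perform the declared writes and commit. A hypothetical userland model of that create/assign/abort-or-commit lifecycle (none of these names are the real DMU API):

	#include <stdio.h>

	/* hypothetical states mirroring the DMU's tx lifecycle */
	typedef enum { TX_CREATED, TX_ASSIGNED, TX_DONE } tx_state_t;
	typedef struct { tx_state_t state; } tx_t;

	static int
	tx_assign(tx_t *tx, int want_fail)
	{
		if (want_fail)
			return (-1);		/* caller must abort */
		tx->state = TX_ASSIGNED;
		return (0);
	}

	static void tx_abort(tx_t *tx)	{ tx->state = TX_DONE; }
	static void tx_commit(tx_t *tx)	{ tx->state = TX_DONE; }

	static int
	do_write(int want_fail)
	{
		tx_t tx = { TX_CREATED };

		if (tx_assign(&tx, want_fail) != 0) {
			tx_abort(&tx);		/* error path: abort, never commit */
			return (-1);
		}
		/* ... perform the declared writes here ... */
		tx_commit(&tx);
		return (0);
	}

	int
	main(void)
	{
		(void) printf("%d %d\n", do_write(0), do_write(1));
		return (0);
	}
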
1213 1215  
1214 1216  /*
1215 1217   * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
1216 1218   * streams to refer to a copy of the data that is already on the
1217 1219   * system because it came in earlier in the stream.  This function
1218 1220   * finds the earlier copy of the data, and uses that copy instead of
1219 1221   * data from the stream to fulfill this write.
1220 1222   */
1221 1223  static int
1222 1224  restore_write_byref(struct restorearg *ra, objset_t *os,
1223 1225      struct drr_write_byref *drrwbr)
1224 1226  {
1225 1227          dmu_tx_t *tx;
1226 1228          int err;
1227 1229          guid_map_entry_t gmesrch;
1228 1230          guid_map_entry_t *gmep;
1229 1231          avl_index_t     where;
1230 1232          objset_t *ref_os = NULL;
1231 1233          dmu_buf_t *dbp;
1232 1234  
1233 1235          if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
1234 1236                  return (EINVAL);
1235 1237  
1236 1238          /*
1237 1239           * If the GUID of the referenced dataset is different from the
1238 1240           * GUID of the target dataset, find the referenced dataset.
1239 1241           */
1240 1242          if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
1241 1243                  gmesrch.guid = drrwbr->drr_refguid;
1242 1244                  if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
1243 1245                      &where)) == NULL) {
1244 1246                          return (EINVAL);
1245 1247                  }
1246 1248                  if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
1247 1249                          return (EINVAL);
1248 1250          } else {
1249 1251                  ref_os = os;
1250 1252          }
1251 1253  
1252 1254          if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
1253 1255              drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
1254 1256                  return (err);
1255 1257  
1256 1258          tx = dmu_tx_create(os);
1257 1259  
1258 1260          dmu_tx_hold_write(tx, drrwbr->drr_object,
1259 1261              drrwbr->drr_offset, drrwbr->drr_length);
1260 1262          err = dmu_tx_assign(tx, TXG_WAIT);
1261 1263          if (err) {
1262 1264                  dmu_tx_abort(tx);
1263 1265                  return (err);
1264 1266          }
1265 1267          dmu_write(os, drrwbr->drr_object,
1266 1268              drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
1267 1269          dmu_buf_rele(dbp, FTAG);
1268 1270          dmu_tx_commit(tx);
1269 1271          return (0);
1270 1272  }
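
As the block comment explains, a DRR_WRITE_BYREF record carries no payload: it identifies data that arrived earlier in the stream by (refguid, refobject, refoffset), and the receiver satisfies the write from that local copy. A simplified sketch of the lookup-then-copy idea, with a linear table standing in for the AVL guid map (all names here are hypothetical):

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* hypothetical stand-in for the guid -> dataset map built on receive */
	typedef struct { uint64_t guid; const char *data; } guid_entry_t;

	static const char *
	lookup_by_guid(const guid_entry_t *map, int n, uint64_t guid)
	{
		int i;

		for (i = 0; i < n; i++)
			if (map[i].guid == guid)
				return (map[i].data);
		return (NULL);		/* unknown guid: stream is invalid */
	}

	int
	main(void)
	{
		guid_entry_t map[] = { { 0xabcd, "earlier payload" } };
		char buf[32] = { 0 };
		const char *src = lookup_by_guid(map, 1, 0xabcd);

		if (src == NULL)
			return (1);
		/* fulfill the by-reference write from the local copy */
		(void) memcpy(buf, src, strlen(src) + 1);
		(void) printf("%s\n", buf);
		return (0);
	}
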
1271 1273  
1272 1274  static int
1273 1275  restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
1274 1276  {
1275 1277          dmu_tx_t *tx;
1276 1278          void *data;
1277 1279          dmu_buf_t *db, *db_spill;
1278 1280          int err;
1279 1281  
1280 1282          if (drrs->drr_length < SPA_MINBLOCKSIZE ||
1281 1283              drrs->drr_length > SPA_MAXBLOCKSIZE)
1282 1284                  return (EINVAL);
1283 1285  
1284 1286          data = restore_read(ra, drrs->drr_length);
1285 1287          if (data == NULL)
1286 1288                  return (ra->err);
1287 1289  
1288 1290          if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
1289 1291                  return (EINVAL);
1290 1292  
1291 1293          VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
1292 1294          if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
1293 1295                  dmu_buf_rele(db, FTAG);
1294 1296                  return (err);
1295 1297          }
1296 1298  
1297 1299          tx = dmu_tx_create(os);
1298 1300  
1299 1301          dmu_tx_hold_spill(tx, db->db_object);
1300 1302  
1301 1303          err = dmu_tx_assign(tx, TXG_WAIT);
1302 1304          if (err) {
1303 1305                  dmu_buf_rele(db, FTAG);
1304 1306                  dmu_buf_rele(db_spill, FTAG);
1305 1307                  dmu_tx_abort(tx);
1306 1308                  return (err);
1307 1309          }
1308 1310          dmu_buf_will_dirty(db_spill, tx);
1309 1311  
1310 1312          if (db_spill->db_size < drrs->drr_length)
1311 1313                  VERIFY(0 == dbuf_spill_set_blksz(db_spill,
1312 1314                      drrs->drr_length, tx));
1313 1315          bcopy(data, db_spill->db_data, drrs->drr_length);
1314 1316  
1315 1317          dmu_buf_rele(db, FTAG);
1316 1318          dmu_buf_rele(db_spill, FTAG);
1317 1319  
1318 1320          dmu_tx_commit(tx);
1319 1321          return (0);
1320 1322  }
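
Note that restore_spill only ever grows the spill buffer before copying: dbuf_spill_set_blksz runs just when the existing block is smaller than the incoming record. The same grow-before-copy shape in plain C, as a hypothetical helper modeled loosely on the logic above:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	/* grow the destination only when it is too small, then copy */
	static int
	copy_grow(void **bufp, size_t *capp, const void *src, size_t len)
	{
		if (*capp < len) {
			void *nb = realloc(*bufp, len);

			if (nb == NULL)
				return (-1);
			*bufp = nb;
			*capp = len;
		}
		(void) memcpy(*bufp, src, len);	/* capacity now guaranteed */
		return (0);
	}

	int
	main(void)
	{
		void *buf = NULL;
		size_t cap = 0;
		const char msg[] = "spill payload";

		if (copy_grow(&buf, &cap, msg, sizeof (msg)) == 0)
			(void) printf("%s (cap=%zu)\n", (char *)buf, cap);
		free(buf);
		return (0);
	}
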
1321 1323  
1322 1324  /* ARGSUSED */
1323 1325  static int
1324 1326  restore_free(struct restorearg *ra, objset_t *os,
1325 1327      struct drr_free *drrf)
1326 1328  {
1327 1329          int err;
1328 1330  
1329 1331          if (drrf->drr_length != -1ULL &&
1330 1332              drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
1331 1333                  return (EINVAL);
1332 1334  
1333 1335          if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
1334 1336                  return (EINVAL);
1335 1337  
1336 1338          err = dmu_free_long_range(os, drrf->drr_object,
1337 1339              drrf->drr_offset, drrf->drr_length);
1338 1340          return (err);
1339 1341  }
1340 1342  
1341 1343  /*
1342 1344   * NB: callers *must* call dmu_recv_end() if this succeeds.
1343 1345   */
1344 1346  int
1345 1347  dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
1346 1348      int cleanup_fd, uint64_t *action_handlep)
1347 1349  {
1348 1350          struct restorearg ra = { 0 };
1349 1351          dmu_replay_record_t *drr;
1350 1352          objset_t *os;
1351 1353          zio_cksum_t pcksum;
1352 1354          int featureflags;
1353 1355  
1354 1356          if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
1355 1357                  ra.byteswap = TRUE;
1356 1358  
1357 1359          {
1358 1360                  /* compute checksum of drr_begin record */
1359 1361                  dmu_replay_record_t *drr;
1360 1362                  drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
1361 1363  
1362 1364                  drr->drr_type = DRR_BEGIN;
1363 1365                  drr->drr_u.drr_begin = *drc->drc_drrb;
1364 1366                  if (ra.byteswap) {
1365 1367                          fletcher_4_incremental_byteswap(drr,
1366 1368                              sizeof (dmu_replay_record_t), &ra.cksum);
1367 1369                  } else {
1368 1370                          fletcher_4_incremental_native(drr,
1369 1371                              sizeof (dmu_replay_record_t), &ra.cksum);
1370 1372                  }
1371 1373                  kmem_free(drr, sizeof (dmu_replay_record_t));
1372 1374          }
1373 1375  
1374 1376          if (ra.byteswap) {
1375 1377                  struct drr_begin *drrb = drc->drc_drrb;
1376 1378                  drrb->drr_magic = BSWAP_64(drrb->drr_magic);
1377 1379                  drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
1378 1380                  drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
1379 1381                  drrb->drr_type = BSWAP_32(drrb->drr_type);
1380 1382                  drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
1381 1383                  drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
1382 1384          }
1383 1385  
1384 1386          ra.vp = vp;
1385 1387          ra.voff = *voffp;
1386 1388          ra.bufsize = 1<<20;
1387 1389          ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);
1388 1390  
1389 1391          /* these were verified in dmu_recv_begin */
1390 1392          ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
1391 1393              DMU_SUBSTREAM);
1392 1394          ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);
1393 1395  
1394 1396          /*
1395 1397           * Open the objset we are modifying.
1396 1398           */
1397 1399          VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);
1398 1400  
1399 1401          ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);
1400 1402  
1401 1403          featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);
1402 1404  
1403 1405          /* if this stream is dedup'ed, set up the avl tree for guid mapping */
1404 1406          if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
1405 1407                  minor_t minor;
1406 1408  
1407 1409                  if (cleanup_fd == -1) {
1408 1410                          ra.err = EBADF;
1409 1411                          goto out;
1410 1412                  }
1411 1413                  ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
1412 1414                  if (ra.err) {
1413 1415                          cleanup_fd = -1;
1414 1416                          goto out;
1415 1417                  }
1416 1418  
1417 1419                  if (*action_handlep == 0) {
1418 1420                          ra.guid_to_ds_map =
1419 1421                              kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
1420 1422                          avl_create(ra.guid_to_ds_map, guid_compare,
1421 1423                              sizeof (guid_map_entry_t),
1422 1424                              offsetof(guid_map_entry_t, avlnode));
1423 1425                          ra.err = zfs_onexit_add_cb(minor,
1424 1426                              free_guid_map_onexit, ra.guid_to_ds_map,
1425 1427                              action_handlep);
1426 1428                          if (ra.err)
1427 1429                                  goto out;
1428 1430                  } else {
1429 1431                          ra.err = zfs_onexit_cb_data(minor, *action_handlep,
1430 1432                              (void **)&ra.guid_to_ds_map);
1431 1433                          if (ra.err)
1432 1434                                  goto out;
1433 1435                  }
1434 1436  
1435 1437                  drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
1436 1438          }
1437 1439  
1438 1440          /*
1439 1441           * Read records and process them.
1440 1442           */
1441 1443          pcksum = ra.cksum;
1442 1444          while (ra.err == 0 &&
1443 1445              NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
1444 1446                  if (issig(JUSTLOOKING) && issig(FORREAL)) {
1445 1447                          ra.err = EINTR;
1446 1448                          goto out;
1447 1449                  }
1448 1450  
1449 1451                  if (ra.byteswap)
1450 1452                          backup_byteswap(drr);
1451 1453  
1452 1454                  switch (drr->drr_type) {
1453 1455                  case DRR_OBJECT:
1454 1456                  {
1455 1457                          /*
1456 1458                           * We need to make a copy of the record header,
1457 1459                           * because restore_{object,write} may need to
1458 1460                           * restore_read(), which will invalidate drr.
1459 1461                           */
1460 1462                          struct drr_object drro = drr->drr_u.drr_object;
1461 1463                          ra.err = restore_object(&ra, os, &drro);
1462 1464                          break;
1463 1465                  }
1464 1466                  case DRR_FREEOBJECTS:
1465 1467                  {
1466 1468                          struct drr_freeobjects drrfo =
1467 1469                              drr->drr_u.drr_freeobjects;
1468 1470                          ra.err = restore_freeobjects(&ra, os, &drrfo);
1469 1471                          break;
1470 1472                  }
1471 1473                  case DRR_WRITE:
1472 1474                  {
1473 1475                          struct drr_write drrw = drr->drr_u.drr_write;
1474 1476                          ra.err = restore_write(&ra, os, &drrw);
1475 1477                          break;
1476 1478                  }
1477 1479                  case DRR_WRITE_BYREF:
1478 1480                  {
1479 1481                          struct drr_write_byref drrwbr =
1480 1482                              drr->drr_u.drr_write_byref;
1481 1483                          ra.err = restore_write_byref(&ra, os, &drrwbr);
1482 1484                          break;
1483 1485                  }
1484 1486                  case DRR_FREE:
1485 1487                  {
1486 1488                          struct drr_free drrf = drr->drr_u.drr_free;
1487 1489                          ra.err = restore_free(&ra, os, &drrf);
1488 1490                          break;
1489 1491                  }
1490 1492                  case DRR_END:
1491 1493                  {
1492 1494                          struct drr_end drre = drr->drr_u.drr_end;
1493 1495                          /*
1494 1496                           * We compare against the *previous* checksum
1495 1497                           * value, because the stored checksum is of
1496 1498                           * everything before the DRR_END record.
1497 1499                           */
1498 1500                          if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
1499 1501                                  ra.err = ECKSUM;
1500 1502                          goto out;
1501 1503                  }
1502 1504                  case DRR_SPILL:
1503 1505                  {
1504 1506                          struct drr_spill drrs = drr->drr_u.drr_spill;
1505 1507                          ra.err = restore_spill(&ra, os, &drrs);
1506 1508                          break;
1507 1509                  }
1508 1510                  default:
1509 1511                          ra.err = EINVAL;
1510 1512                          goto out;
1511 1513                  }
1512 1514                  pcksum = ra.cksum;
1513 1515          }
1514 1516          ASSERT(ra.err != 0);
1515 1517  
1516 1518  out:
1517 1519          if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
1518 1520                  zfs_onexit_fd_rele(cleanup_fd);
1519 1521  
1520 1522          if (ra.err != 0) {
1521 1523                  /*
1522 1524                   * destroy what we created, so we don't leave it in the
1523 1525                   * inconsistent restoring state.
1524 1526                   */
1525 1527                  txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);
1526 1528  
1527 1529                  (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
1528 1530                      B_FALSE);
1529 1531                  if (drc->drc_real_ds != drc->drc_logical_ds) {
1530 1532                          mutex_exit(&drc->drc_logical_ds->ds_recvlock);
1531 1533                          dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
1532 1534                  }
1533 1535          }
1534 1536  
1535 1537          kmem_free(ra.buf, ra.bufsize);
1536 1538          *voffp = ra.voff;
1537 1539          return (ra.err);
1538 1540  }
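
Note how dmu_recv_stream maintains pcksum one record behind ra.cksum: the checksum stored in a DRR_END record covers everything before that record, so the comparison has to use the lagged value. A toy model of that one-record lag, with a plain byte sum standing in for fletcher-4:

	#include <stdint.h>
	#include <stdio.h>

	/* trivial rolling "checksum" standing in for fletcher_4_incremental_* */
	static void
	cksum_update(uint64_t *ck, const uint8_t *buf, int len)
	{
		int i;

		for (i = 0; i < len; i++)
			*ck += buf[i];
	}

	int
	main(void)
	{
		uint8_t records[3][4] = { {1, 1, 1, 1}, {2, 2, 2, 2}, {9, 9, 9, 9} };
		uint64_t cksum = 0, pcksum = 0;
		int i;

		for (i = 0; i < 3; i++) {
			cksum_update(&cksum, records[i], 4);
			if (i == 2) {
				/*
				 * Pretend record 2 is DRR_END: it stores the
				 * checksum of records 0..1, so compare against
				 * the lagged value, not the current one.
				 */
				(void) printf("end sees pcksum=%llu (cksum=%llu)\n",
				    (unsigned long long)pcksum,
				    (unsigned long long)cksum);
				break;
			}
			pcksum = cksum;	/* advance the lag after each record */
		}
		return (0);
	}
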
1539 1541  
1540 1542  struct recvendsyncarg {
1541 1543          char *tosnap;
1542 1544          uint64_t creation_time;
1543 1545          uint64_t toguid;
1544 1546  };
1545 1547  
1546 1548  static int
1547 1549  recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
1548 1550  {
1549 1551          dsl_dataset_t *ds = arg1;
1550 1552          struct recvendsyncarg *resa = arg2;
1551 1553  
1552 1554          return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
1553 1555  }
1554 1556  
1555 1557  static void
1556 1558  recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
1557 1559  {
1558 1560          dsl_dataset_t *ds = arg1;
1559 1561          struct recvendsyncarg *resa = arg2;
1560 1562  
1561 1563          dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);
1562 1564  
1563 1565          /* set snapshot's creation time and guid */
1564 1566          dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
1565 1567          ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
1566 1568          ds->ds_prev->ds_phys->ds_guid = resa->toguid;
1567 1569          ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
1568 1570  
1569 1571          dmu_buf_will_dirty(ds->ds_dbuf, tx);
1570 1572          ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
1571 1573  }
1572 1574  
1573 1575  static int
1574 1576  add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
1575 1577  {
1576 1578          dsl_pool_t *dp = ds->ds_dir->dd_pool;
1577 1579          uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
1578 1580          dsl_dataset_t *snapds;
1579 1581          guid_map_entry_t *gmep;
1580 1582          int err;
1581 1583  
1582 1584          ASSERT(guid_map != NULL);
1583 1585  
1584 1586          rw_enter(&dp->dp_config_rwlock, RW_READER);
1585 1587          err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
1586 1588          if (err == 0) {
1587 1589                  gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
1588 1590                  gmep->guid = snapds->ds_phys->ds_guid;
1589 1591                  gmep->gme_ds = snapds;
1590 1592                  avl_add(guid_map, gmep);
1591 1593          }
1592 1594  
1593 1595          rw_exit(&dp->dp_config_rwlock);
1594 1596          return (err);
1595 1597  }
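
add_ds_to_guidmap takes a hold on the just-created snapshot and files it in the AVL tree keyed by guid, which is what later lets restore_write_byref resolve drr_refguid. A hedged sketch of the same create/add/find flow against the illumos libavl interface used here (the entry struct and values are hypothetical; link with -lavl in userland):

	#include <sys/avl.h>
	#include <stddef.h>
	#include <stdint.h>
	#include <stdio.h>

	/* hypothetical entry mirroring guid_map_entry_t */
	typedef struct {
		uint64_t guid;
		avl_node_t avlnode;
	} entry_t;

	/* comparator must return -1, 0, or 1 for the AVL tree */
	static int
	guid_compare(const void *a, const void *b)
	{
		uint64_t ga = ((const entry_t *)a)->guid;
		uint64_t gb = ((const entry_t *)b)->guid;

		if (ga < gb)
			return (-1);
		return (ga > gb);
	}

	int
	main(void)
	{
		avl_tree_t map;
		avl_index_t where;
		entry_t e1 = { .guid = 42 }, srch = { .guid = 42 };
		entry_t *found;

		avl_create(&map, guid_compare, sizeof (entry_t),
		    offsetof(entry_t, avlnode));
		avl_add(&map, &e1);

		found = avl_find(&map, &srch, &where);
		if (found != NULL)
			(void) printf("found guid %llu\n",
			    (unsigned long long)found->guid);
		return (0);
	}
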
1596 1598  
1597 1599  static int
1598 1600  dmu_recv_existing_end(dmu_recv_cookie_t *drc)
1599 1601  {
1600 1602          struct recvendsyncarg resa;
1601 1603          dsl_dataset_t *ds = drc->drc_logical_ds;
1602 1604          int err, myerr;
1603 1605  
1604 1606          /*
1605 1607           * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
1606 1608           * expects it to have a ds_user_ptr (and zil), but clone_swap()
1607 1609           * can close it.
1608 1610           */
1609 1611          txg_wait_synced(ds->ds_dir->dd_pool, 0);
1610 1612  
1611 1613          if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
1612 1614                  err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
1613 1615                      drc->drc_force);
1614 1616                  if (err)
1615 1617                          goto out;
1616 1618          } else {
1617 1619                  mutex_exit(&ds->ds_recvlock);
1618 1620                  dsl_dataset_rele(ds, dmu_recv_tag);
1619 1621                  (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
1620 1622                      B_FALSE);
1621 1623                  return (EBUSY);
1622 1624          }
1623 1625  
1624 1626          resa.creation_time = drc->drc_drrb->drr_creation_time;
1625 1627          resa.toguid = drc->drc_drrb->drr_toguid;
1626 1628          resa.tosnap = drc->drc_tosnap;
1627 1629  
1628 1630          err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1629 1631              recv_end_check, recv_end_sync, ds, &resa, 3);
1630 1632          if (err) {
1631 1633                  /* swap back */
1632 1634                  (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
1633 1635          }
1634 1636  
1635 1637  out:
1636 1638          mutex_exit(&ds->ds_recvlock);
1637 1639          if (err == 0 && drc->drc_guid_to_ds_map != NULL)
1638 1640                  (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
1639 1641          dsl_dataset_disown(ds, dmu_recv_tag);
1640 1642          myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
1641 1643          ASSERT3U(myerr, ==, 0);
1642 1644          return (err);
1643 1645  }
1644 1646  
1645 1647  static int
1646 1648  dmu_recv_new_end(dmu_recv_cookie_t *drc)
1647 1649  {
1648 1650          struct recvendsyncarg resa;
1649 1651          dsl_dataset_t *ds = drc->drc_logical_ds;
1650 1652          int err;
1651 1653  
1652 1654          /*
1653 1655           * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
1654 1656           * expects it to have a ds_user_ptr (and zil), but clone_swap()
1655 1657           * can close it.
1656 1658           */
1657 1659          txg_wait_synced(ds->ds_dir->dd_pool, 0);
1658 1660  
1659 1661          resa.creation_time = drc->drc_drrb->drr_creation_time;
1660 1662          resa.toguid = drc->drc_drrb->drr_toguid;
1661 1663          resa.tosnap = drc->drc_tosnap;
1662 1664  
1663 1665          err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1664 1666              recv_end_check, recv_end_sync, ds, &resa, 3);
1665 1667          if (err) {
1666 1668                  /* clean up the fs we just recv'd into */
1667 1669                  (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
1668 1670          } else {
1669 1671                  if (drc->drc_guid_to_ds_map != NULL)
1670 1672                          (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
1671 1673                  /* release the hold from dmu_recv_begin */
1672 1674                  dsl_dataset_disown(ds, dmu_recv_tag);
1673 1675          }
1674 1676          return (err);
1675 1677  }
1676 1678  
1677 1679  int
1678 1680  dmu_recv_end(dmu_recv_cookie_t *drc)
1679 1681  {
1680 1682          if (drc->drc_logical_ds != drc->drc_real_ds)
1681 1683                  return (dmu_recv_existing_end(drc));
1682 1684          else
1683 1685                  return (dmu_recv_new_end(drc));
1684 1686  }
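
dmu_recv_end simply dispatches to the existing-filesystem or new-filesystem completion path; either way it is the call that snapshots the received data and clears DS_FLAG_INCONSISTENT, which is why the "NB" above dmu_recv_stream obligates callers to make it on success. A hypothetical userland model of that two-phase contract (not the real kernel interfaces):

	#include <stdio.h>

	/* hypothetical two-phase receive: stream, then the mandatory end */
	typedef struct { int inconsistent; } recv_cookie_t;

	static int
	recv_stream(recv_cookie_t *c)
	{
		c->inconsistent = 1;	/* data landed, dataset not yet usable */
		return (0);
	}

	static int
	recv_end(recv_cookie_t *c)
	{
		c->inconsistent = 0;	/* snapshot taken, flag cleared */
		return (0);
	}

	int
	main(void)
	{
		recv_cookie_t c = { 0 };

		if (recv_stream(&c) == 0)
			(void) recv_end(&c);	/* success obligates this call */
		(void) printf("inconsistent=%d\n", c.inconsistent);
		return (0);
	}
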
  
    (467 lines elided)
  