Print this page
    
*** NO COMMENTS ***
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/dmu_send.c
          +++ new/usr/src/uts/common/fs/zfs/dmu_send.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  
    | 
      ↓ open down ↓ | 
    12 lines elided | 
    
      ↑ open up ↑ | 
  
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23      - * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
       23 + * Copyright 2012 Nexenta Systems, Inc. All rights reserved.
  24   24   * Copyright (c) 2012 by Delphix. All rights reserved.
  25   25   * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  26   26   */
  27   27  
  28   28  #include <sys/dmu.h>
  29   29  #include <sys/dmu_impl.h>
  30   30  #include <sys/dmu_tx.h>
  31   31  #include <sys/dbuf.h>
  32   32  #include <sys/dnode.h>
  33   33  #include <sys/zfs_context.h>
  34   34  #include <sys/dmu_objset.h>
  35   35  #include <sys/dmu_traverse.h>
  36   36  #include <sys/dsl_dataset.h>
  37   37  #include <sys/dsl_dir.h>
  38   38  #include <sys/dsl_prop.h>
  39   39  #include <sys/dsl_pool.h>
  40   40  #include <sys/dsl_synctask.h>
  41   41  #include <sys/zfs_ioctl.h>
  42   42  #include <sys/zap.h>
  43   43  #include <sys/zio_checksum.h>
  44   44  #include <sys/zfs_znode.h>
  45   45  #include <zfs_fletcher.h>
  46   46  #include <sys/avl.h>
  47   47  #include <sys/ddt.h>
  48   48  #include <sys/zfs_onexit.h>
  49   49  
  50   50  /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
  51   51  int zfs_send_corrupt_data = B_FALSE;
  
    | 
      ↓ open down ↓ | 
    18 lines elided | 
    
      ↑ open up ↑ | 
  
  52   52  
  53   53  static char *dmu_recv_tag = "dmu_recv_tag";
  54   54  
  55   55  static int
  56   56  dump_bytes(dmu_sendarg_t *dsp, void *buf, int len)
  57   57  {
  58   58          dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset;
  59   59          ssize_t resid; /* have to get resid to get detailed errno */
  60   60          ASSERT3U(len % 8, ==, 0);
  61   61  
  62      -        fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
  63      -        dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
  64      -            (caddr_t)buf, len,
  65      -            0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY, CRED(), &resid);
  66      -
       62 +        dsp->dsa_err = 0;
       63 +        if (!dsp->sendsize) {
       64 +                fletcher_4_incremental_native(buf, len, &dsp->dsa_zc);
       65 +                dsp->dsa_err = vn_rdwr(UIO_WRITE, dsp->dsa_vp,
       66 +                    (caddr_t)buf, len,
       67 +                    0, UIO_SYSSPACE, FAPPEND, RLIM64_INFINITY,
       68 +                    CRED(), &resid);
       69 +        }
  67   70          mutex_enter(&ds->ds_sendstream_lock);
  68   71          *dsp->dsa_off += len;
  69   72          mutex_exit(&ds->ds_sendstream_lock);
  70   73  
  71   74          return (dsp->dsa_err);
  72   75  }
  73   76  
  74   77  static int
  75   78  dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset,
  76   79      uint64_t length)
  77   80  {
  78   81          struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free);
  79   82  
  80   83          if (length != -1ULL && offset + length < offset)
  81   84                  length = -1ULL;
  82   85  
  83   86          /*
  84   87           * If there is a pending op, but it's not PENDING_FREE, push it out,
  85   88           * since free block aggregation can only be done for blocks of the
  86   89           * same type (i.e., DRR_FREE records can only be aggregated with
  87   90           * other DRR_FREE records.  DRR_FREEOBJECTS records can only be
  88   91           * aggregated with other DRR_FREEOBJECTS records.
  89   92           */
  90   93          if (dsp->dsa_pending_op != PENDING_NONE &&
  91   94              dsp->dsa_pending_op != PENDING_FREE) {
  92   95                  if (dump_bytes(dsp, dsp->dsa_drr,
  93   96                      sizeof (dmu_replay_record_t)) != 0)
  94   97                          return (EINTR);
  95   98                  dsp->dsa_pending_op = PENDING_NONE;
  96   99          }
  97  100  
  98  101          if (dsp->dsa_pending_op == PENDING_FREE) {
  99  102                  /*
 100  103                   * There should never be a PENDING_FREE if length is -1
 101  104                   * (because dump_dnode is the only place where this
 102  105                   * function is called with a -1, and only after flushing
 103  106                   * any pending record).
 104  107                   */
 105  108                  ASSERT(length != -1ULL);
 106  109                  /*
 107  110                   * Check to see whether this free block can be aggregated
 108  111                   * with pending one.
 109  112                   */
 110  113                  if (drrf->drr_object == object && drrf->drr_offset +
 111  114                      drrf->drr_length == offset) {
 112  115                          drrf->drr_length += length;
 113  116                          return (0);
 114  117                  } else {
 115  118                          /* not a continuation.  Push out pending record */
 116  119                          if (dump_bytes(dsp, dsp->dsa_drr,
 117  120                              sizeof (dmu_replay_record_t)) != 0)
 118  121                                  return (EINTR);
 119  122                          dsp->dsa_pending_op = PENDING_NONE;
 120  123                  }
 121  124          }
 122  125          /* create a FREE record and make it pending */
 123  126          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 124  127          dsp->dsa_drr->drr_type = DRR_FREE;
 125  128          drrf->drr_object = object;
 126  129          drrf->drr_offset = offset;
 127  130          drrf->drr_length = length;
 128  131          drrf->drr_toguid = dsp->dsa_toguid;
 129  132          if (length == -1ULL) {
 130  133                  if (dump_bytes(dsp, dsp->dsa_drr,
 131  134                      sizeof (dmu_replay_record_t)) != 0)
 132  135                          return (EINTR);
 133  136          } else {
 134  137                  dsp->dsa_pending_op = PENDING_FREE;
 135  138          }
 136  139  
 137  140          return (0);
 138  141  }
 139  142  
 140  143  static int
 141  144  dump_data(dmu_sendarg_t *dsp, dmu_object_type_t type,
 142  145      uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data)
 143  146  {
 144  147          struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write);
 145  148  
 146  149  
 147  150          /*
 148  151           * If there is any kind of pending aggregation (currently either
 149  152           * a grouping of free objects or free blocks), push it out to
 150  153           * the stream, since aggregation can't be done across operations
 151  154           * of different types.
 152  155           */
 153  156          if (dsp->dsa_pending_op != PENDING_NONE) {
 154  157                  if (dump_bytes(dsp, dsp->dsa_drr,
 155  158                      sizeof (dmu_replay_record_t)) != 0)
 156  159                          return (EINTR);
 157  160                  dsp->dsa_pending_op = PENDING_NONE;
 158  161          }
 159  162          /* write a DATA record */
 160  163          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 161  164          dsp->dsa_drr->drr_type = DRR_WRITE;
 162  165          drrw->drr_object = object;
 163  166          drrw->drr_type = type;
 164  167          drrw->drr_offset = offset;
 165  168          drrw->drr_length = blksz;
 166  169          drrw->drr_toguid = dsp->dsa_toguid;
 167  170          drrw->drr_checksumtype = BP_GET_CHECKSUM(bp);
 168  171          if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup)
 169  172                  drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP;
 170  173          DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp));
 171  174          DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp));
 172  175          DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp));
 173  176          drrw->drr_key.ddk_cksum = bp->blk_cksum;
 174  177  
 175  178          if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 176  179                  return (EINTR);
 177  180          if (dump_bytes(dsp, data, blksz) != 0)
 178  181                  return (EINTR);
 179  182          return (0);
 180  183  }
 181  184  
 182  185  static int
 183  186  dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data)
 184  187  {
 185  188          struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill);
 186  189  
 187  190          if (dsp->dsa_pending_op != PENDING_NONE) {
 188  191                  if (dump_bytes(dsp, dsp->dsa_drr,
 189  192                      sizeof (dmu_replay_record_t)) != 0)
 190  193                          return (EINTR);
 191  194                  dsp->dsa_pending_op = PENDING_NONE;
 192  195          }
 193  196  
 194  197          /* write a SPILL record */
 195  198          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 196  199          dsp->dsa_drr->drr_type = DRR_SPILL;
 197  200          drrs->drr_object = object;
 198  201          drrs->drr_length = blksz;
 199  202          drrs->drr_toguid = dsp->dsa_toguid;
 200  203  
 201  204          if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)))
 202  205                  return (EINTR);
 203  206          if (dump_bytes(dsp, data, blksz))
 204  207                  return (EINTR);
 205  208          return (0);
 206  209  }
 207  210  
 208  211  static int
 209  212  dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs)
 210  213  {
 211  214          struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects);
 212  215  
 213  216          /*
 214  217           * If there is a pending op, but it's not PENDING_FREEOBJECTS,
 215  218           * push it out, since free block aggregation can only be done for
 216  219           * blocks of the same type (i.e., DRR_FREE records can only be
 217  220           * aggregated with other DRR_FREE records.  DRR_FREEOBJECTS records
 218  221           * can only be aggregated with other DRR_FREEOBJECTS records.
 219  222           */
 220  223          if (dsp->dsa_pending_op != PENDING_NONE &&
 221  224              dsp->dsa_pending_op != PENDING_FREEOBJECTS) {
 222  225                  if (dump_bytes(dsp, dsp->dsa_drr,
 223  226                      sizeof (dmu_replay_record_t)) != 0)
 224  227                          return (EINTR);
 225  228                  dsp->dsa_pending_op = PENDING_NONE;
 226  229          }
 227  230          if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) {
 228  231                  /*
 229  232                   * See whether this free object array can be aggregated
 230  233                   * with pending one
 231  234                   */
 232  235                  if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
 233  236                          drrfo->drr_numobjs += numobjs;
 234  237                          return (0);
 235  238                  } else {
 236  239                          /* can't be aggregated.  Push out pending record */
 237  240                          if (dump_bytes(dsp, dsp->dsa_drr,
 238  241                              sizeof (dmu_replay_record_t)) != 0)
 239  242                                  return (EINTR);
 240  243                          dsp->dsa_pending_op = PENDING_NONE;
 241  244                  }
 242  245          }
 243  246  
 244  247          /* write a FREEOBJECTS record */
 245  248          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 246  249          dsp->dsa_drr->drr_type = DRR_FREEOBJECTS;
 247  250          drrfo->drr_firstobj = firstobj;
 248  251          drrfo->drr_numobjs = numobjs;
 249  252          drrfo->drr_toguid = dsp->dsa_toguid;
 250  253  
 251  254          dsp->dsa_pending_op = PENDING_FREEOBJECTS;
 252  255  
 253  256          return (0);
 254  257  }
 255  258  
 256  259  static int
 257  260  dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp)
 258  261  {
 259  262          struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object);
 260  263  
 261  264          if (dnp == NULL || dnp->dn_type == DMU_OT_NONE)
 262  265                  return (dump_freeobjects(dsp, object, 1));
 263  266  
 264  267          if (dsp->dsa_pending_op != PENDING_NONE) {
 265  268                  if (dump_bytes(dsp, dsp->dsa_drr,
 266  269                      sizeof (dmu_replay_record_t)) != 0)
 267  270                          return (EINTR);
 268  271                  dsp->dsa_pending_op = PENDING_NONE;
 269  272          }
 270  273  
 271  274          /* write an OBJECT record */
 272  275          bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t));
 273  276          dsp->dsa_drr->drr_type = DRR_OBJECT;
 274  277          drro->drr_object = object;
 275  278          drro->drr_type = dnp->dn_type;
 276  279          drro->drr_bonustype = dnp->dn_bonustype;
 277  280          drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT;
 278  281          drro->drr_bonuslen = dnp->dn_bonuslen;
 279  282          drro->drr_checksumtype = dnp->dn_checksum;
 280  283          drro->drr_compress = dnp->dn_compress;
 281  284          drro->drr_toguid = dsp->dsa_toguid;
 282  285  
 283  286          if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0)
 284  287                  return (EINTR);
 285  288  
 286  289          if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0)
 287  290                  return (EINTR);
 288  291  
 289  292          /* free anything past the end of the file */
 290  293          if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) *
 291  294              (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL))
 292  295                  return (EINTR);
 293  296          if (dsp->dsa_err)
 294  297                  return (EINTR);
 295  298          return (0);
 296  299  }
 297  300  
 298  301  #define BP_SPAN(dnp, level) \
 299  302          (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \
 300  303          (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)))
 301  304  
 302  305  /* ARGSUSED */
 303  306  static int
 304  307  backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, arc_buf_t *pbuf,
 305  308      const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg)
 306  309  {
 307  310          dmu_sendarg_t *dsp = arg;
 308  311          dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE;
 309  312          int err = 0;
 310  313  
 311  314          if (issig(JUSTLOOKING) && issig(FORREAL))
 312  315                  return (EINTR);
 313  316  
 314  317          if (zb->zb_object != DMU_META_DNODE_OBJECT &&
 315  318              DMU_OBJECT_IS_SPECIAL(zb->zb_object)) {
 316  319                  return (0);
 317  320          } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) {
 318  321                  uint64_t span = BP_SPAN(dnp, zb->zb_level);
 319  322                  uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT;
 320  323                  err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT);
 321  324          } else if (bp == NULL) {
 322  325                  uint64_t span = BP_SPAN(dnp, zb->zb_level);
 323  326                  err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span);
 324  327          } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) {
 325  328                  return (0);
 326  329          } else if (type == DMU_OT_DNODE) {
 327  330                  dnode_phys_t *blk;
 328  331                  int i;
 329  332                  int blksz = BP_GET_LSIZE(bp);
 330  333                  uint32_t aflags = ARC_WAIT;
 331  334                  arc_buf_t *abuf;
 332  335  
 333  336                  if (dsl_read(NULL, spa, bp, pbuf,
 334  337                      arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
 335  338                      ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
 336  339                          return (EIO);
 337  340  
 338  341                  blk = abuf->b_data;
 339  342                  for (i = 0; i < blksz >> DNODE_SHIFT; i++) {
 340  343                          uint64_t dnobj = (zb->zb_blkid <<
 341  344                              (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i;
 342  345                          err = dump_dnode(dsp, dnobj, blk+i);
 343  346                          if (err)
 344  347                                  break;
 345  348                  }
 346  349                  (void) arc_buf_remove_ref(abuf, &abuf);
 347  350          } else if (type == DMU_OT_SA) {
 348  351                  uint32_t aflags = ARC_WAIT;
 349  352                  arc_buf_t *abuf;
 350  353                  int blksz = BP_GET_LSIZE(bp);
  
    | 
      ↓ open down ↓ | 
    274 lines elided | 
    
      ↑ open up ↑ | 
  
 351  354  
 352  355                  if (arc_read_nolock(NULL, spa, bp,
 353  356                      arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
 354  357                      ZIO_FLAG_CANFAIL, &aflags, zb) != 0)
 355  358                          return (EIO);
 356  359  
 357  360                  err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data);
 358  361                  (void) arc_buf_remove_ref(abuf, &abuf);
 359  362          } else { /* it's a level-0 block of a regular object */
 360  363                  uint32_t aflags = ARC_WAIT;
 361      -                arc_buf_t *abuf;
      364 +                arc_buf_t *abuf = NULL;
      365 +                void *buf = NULL;
 362  366                  int blksz = BP_GET_LSIZE(bp);
 363  367  
 364      -                if (dsl_read(NULL, spa, bp, pbuf,
 365      -                    arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
 366      -                    ZIO_FLAG_CANFAIL, &aflags, zb) != 0) {
 367      -                        if (zfs_send_corrupt_data) {
      368 +                if (!dsp->sendsize) {
      369 +                        if (dsl_read(NULL, spa, bp, pbuf,
      370 +                            arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ,
      371 +                            ZIO_FLAG_CANFAIL, &aflags, zb) != 0) {
      372 +                                if (zfs_send_corrupt_data) {
 368  373                                  /* Send a block filled with 0x"zfs badd bloc" */
 369      -                                abuf = arc_buf_alloc(spa, blksz, &abuf,
 370      -                                    ARC_BUFC_DATA);
 371      -                                uint64_t *ptr;
 372      -                                for (ptr = abuf->b_data;
 373      -                                    (char *)ptr < (char *)abuf->b_data + blksz;
 374      -                                    ptr++)
 375      -                                        *ptr = 0x2f5baddb10c;
 376      -                        } else {
 377      -                                return (EIO);
      374 +                                        abuf = arc_buf_alloc(spa, blksz, &abuf,
      375 +                                            ARC_BUFC_DATA);
      376 +                                        uint64_t *ptr;
      377 +                                        for (ptr = abuf->b_data;
      378 +                                            (char *)ptr <
      379 +                                            (char *)abuf->b_data + blksz;
      380 +                                            ptr++)
      381 +                                                *ptr = 0x2f5baddb10c;
      382 +                                } else {
      383 +                                        return (EIO);
      384 +                                }
 378  385                          }
      386 +                        buf = abuf->b_data;
 379  387                  }
 380  388  
 381  389                  err = dump_data(dsp, type, zb->zb_object, zb->zb_blkid * blksz,
 382      -                    blksz, bp, abuf->b_data);
 383      -                (void) arc_buf_remove_ref(abuf, &abuf);
      390 +                    blksz, bp, buf);
      391 +                if (!dsp->sendsize) {
      392 +                        (void) arc_buf_remove_ref(abuf, &abuf);
      393 +                }
 384  394          }
 385  395  
 386  396          ASSERT(err == 0 || err == EINTR);
 387  397          return (err);
 388  398  }
 389  399  
 390  400  /*
 391  401   * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
 392  402   * For example, they could both be snapshots of the same filesystem, and
 393  403   * 'earlier' is before 'later'.  Or 'earlier' could be the origin of
 394  404   * 'later's filesystem.  Or 'earlier' could be an older snapshot in the origin's
 395  405   * filesystem.  Or 'earlier' could be the origin's origin.
 396  406   */
 397  407  static boolean_t
 398  408  is_before(dsl_dataset_t *later, dsl_dataset_t *earlier)
 399  409  {
 400  410          dsl_pool_t *dp = later->ds_dir->dd_pool;
 401  411          int error;
 402  412          boolean_t ret;
 403  413          dsl_dataset_t *origin;
 404  414  
 405  415          if (earlier->ds_phys->ds_creation_txg >=
 406  416              later->ds_phys->ds_creation_txg)
 407  417                  return (B_FALSE);
 408  418  
 409  419          if (later->ds_dir == earlier->ds_dir)
 410  420                  return (B_TRUE);
 411  421          if (!dsl_dir_is_clone(later->ds_dir))
 412  422                  return (B_FALSE);
 413  423  
 414  424          rw_enter(&dp->dp_config_rwlock, RW_READER);
 415  425          if (later->ds_dir->dd_phys->dd_origin_obj == earlier->ds_object) {
 416  426                  rw_exit(&dp->dp_config_rwlock);
 417  427                  return (B_TRUE);
 418  428          }
  
    | 
      ↓ open down ↓ | 
    25 lines elided | 
    
      ↑ open up ↑ | 
  
 419  429          error = dsl_dataset_hold_obj(dp,
 420  430              later->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin);
 421  431          rw_exit(&dp->dp_config_rwlock);
 422  432          if (error != 0)
 423  433                  return (B_FALSE);
 424  434          ret = is_before(origin, earlier);
 425  435          dsl_dataset_rele(origin, FTAG);
 426  436          return (ret);
 427  437  }
 428  438  
      439 +
 429  440  int
 430  441  dmu_send(objset_t *tosnap, objset_t *fromsnap, int outfd, vnode_t *vp,
 431      -    offset_t *off)
      442 +    offset_t *off, boolean_t sendsize)
 432  443  {
 433  444          dsl_dataset_t *ds = tosnap->os_dsl_dataset;
 434  445          dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
 435  446          dmu_replay_record_t *drr;
 436  447          dmu_sendarg_t *dsp;
 437  448          int err;
 438  449          uint64_t fromtxg = 0;
 439  450  
 440  451          /* tosnap must be a snapshot */
 441  452          if (ds->ds_phys->ds_next_snap_obj == 0)
 442  453                  return (EINVAL);
 443  454  
 444  455          /*
 445  456           * fromsnap must be an earlier snapshot from the same fs as tosnap,
 446  457           * or the origin's fs.
 447  458           */
 448  459          if (fromds != NULL && !is_before(ds, fromds))
 449  460                  return (EXDEV);
 450  461  
 451  462          drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);
 452  463          drr->drr_type = DRR_BEGIN;
 453  464          drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC;
 454  465          DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo,
 455  466              DMU_SUBSTREAM);
 456  467  
 457  468  #ifdef _KERNEL
 458  469          if (dmu_objset_type(tosnap) == DMU_OST_ZFS) {
 459  470                  uint64_t version;
 460  471                  if (zfs_get_zplprop(tosnap, ZFS_PROP_VERSION, &version) != 0) {
 461  472                          kmem_free(drr, sizeof (dmu_replay_record_t));
 462  473                          return (EINVAL);
 463  474                  }
 464  475                  if (version == ZPL_VERSION_SA) {
 465  476                          DMU_SET_FEATUREFLAGS(
 466  477                              drr->drr_u.drr_begin.drr_versioninfo,
 467  478                              DMU_BACKUP_FEATURE_SA_SPILL);
 468  479                  }
 469  480          }
 470  481  #endif
 471  482  
 472  483          drr->drr_u.drr_begin.drr_creation_time =
 473  484              ds->ds_phys->ds_creation_time;
 474  485          drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type;
 475  486          if (fromds != NULL && ds->ds_dir != fromds->ds_dir)
 476  487                  drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE;
 477  488          drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid;
 478  489          if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET)
 479  490                  drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA;
 480  491  
 481  492          if (fromds)
 482  493                  drr->drr_u.drr_begin.drr_fromguid = fromds->ds_phys->ds_guid;
 483  494          dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname);
 484  495  
 485  496          if (fromds)
 486  497                  fromtxg = fromds->ds_phys->ds_creation_txg;
 487  498  
 488  499          dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP);
  
    | 
      ↓ open down ↓ | 
    47 lines elided | 
    
      ↑ open up ↑ | 
  
 489  500  
 490  501          dsp->dsa_drr = drr;
 491  502          dsp->dsa_vp = vp;
 492  503          dsp->dsa_outfd = outfd;
 493  504          dsp->dsa_proc = curproc;
 494  505          dsp->dsa_os = tosnap;
 495  506          dsp->dsa_off = off;
 496  507          dsp->dsa_toguid = ds->ds_phys->ds_guid;
 497  508          ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0);
 498  509          dsp->dsa_pending_op = PENDING_NONE;
      510 +        dsp->sendsize = sendsize;
 499  511  
 500  512          mutex_enter(&ds->ds_sendstream_lock);
 501  513          list_insert_head(&ds->ds_sendstreams, dsp);
 502  514          mutex_exit(&ds->ds_sendstream_lock);
 503  515  
 504  516          if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
 505  517                  err = dsp->dsa_err;
 506  518                  goto out;
 507  519          }
 508  520  
 509      -        err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
 510      -            backup_cb, dsp);
      521 +        if (dsp->sendsize) {
      522 +                err = traverse_dataset(ds, fromtxg,
      523 +                    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
      524 +                    backup_cb, dsp);
      525 +        } else {
      526 +                err = traverse_dataset(ds,
      527 +                    fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH,
      528 +                    backup_cb, dsp);
      529 +        }
 511  530  
 512  531          if (dsp->dsa_pending_op != PENDING_NONE)
 513  532                  if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0)
 514  533                          err = EINTR;
 515  534  
 516  535          if (err) {
 517  536                  if (err == EINTR && dsp->dsa_err)
 518  537                          err = dsp->dsa_err;
 519  538                  goto out;
 520  539          }
 521  540  
 522  541          bzero(drr, sizeof (dmu_replay_record_t));
 523  542          drr->drr_type = DRR_END;
 524  543          drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc;
 525  544          drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid;
 526  545  
 527  546          if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) {
 528  547                  err = dsp->dsa_err;
 529  548                  goto out;
 530  549          }
 531  550  
 532  551  out:
 533  552          mutex_enter(&ds->ds_sendstream_lock);
 534  553          list_remove(&ds->ds_sendstreams, dsp);
 535  554          mutex_exit(&ds->ds_sendstream_lock);
 536  555  
 537  556          kmem_free(drr, sizeof (dmu_replay_record_t));
 538  557          kmem_free(dsp, sizeof (dmu_sendarg_t));
 539  558  
 540  559          return (err);
 541  560  }
 542  561  
 543  562  int
 544  563  dmu_send_estimate(objset_t *tosnap, objset_t *fromsnap, uint64_t *sizep)
 545  564  {
 546  565          dsl_dataset_t *ds = tosnap->os_dsl_dataset;
 547  566          dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL;
 548  567          dsl_pool_t *dp = ds->ds_dir->dd_pool;
 549  568          int err;
 550  569          uint64_t size;
 551  570  
 552  571          /* tosnap must be a snapshot */
 553  572          if (ds->ds_phys->ds_next_snap_obj == 0)
 554  573                  return (EINVAL);
 555  574  
 556  575          /*
 557  576           * fromsnap must be an earlier snapshot from the same fs as tosnap,
 558  577           * or the origin's fs.
 559  578           */
 560  579          if (fromds != NULL && !is_before(ds, fromds))
 561  580                  return (EXDEV);
 562  581  
 563  582          /* Get uncompressed size estimate of changed data. */
 564  583          if (fromds == NULL) {
 565  584                  size = ds->ds_phys->ds_uncompressed_bytes;
 566  585          } else {
 567  586                  uint64_t used, comp;
 568  587                  err = dsl_dataset_space_written(fromds, ds,
 569  588                      &used, &comp, &size);
 570  589                  if (err)
 571  590                          return (err);
 572  591          }
 573  592  
 574  593          /*
 575  594           * Assume that space (both on-disk and in-stream) is dominated by
 576  595           * data.  We will adjust for indirect blocks and the copies property,
 577  596           * but ignore per-object space used (eg, dnodes and DRR_OBJECT records).
 578  597           */
 579  598  
 580  599          /*
 581  600           * Subtract out approximate space used by indirect blocks.
 582  601           * Assume most space is used by data blocks (non-indirect, non-dnode).
 583  602           * Assume all blocks are recordsize.  Assume ditto blocks and
 584  603           * internal fragmentation counter out compression.
 585  604           *
 586  605           * Therefore, space used by indirect blocks is sizeof(blkptr_t) per
 587  606           * block, which we observe in practice.
 588  607           */
 589  608          uint64_t recordsize;
 590  609          rw_enter(&dp->dp_config_rwlock, RW_READER);
 591  610          err = dsl_prop_get_ds(ds, "recordsize",
 592  611              sizeof (recordsize), 1, &recordsize, NULL);
 593  612          rw_exit(&dp->dp_config_rwlock);
 594  613          if (err)
 595  614                  return (err);
 596  615          size -= size / recordsize * sizeof (blkptr_t);
 597  616  
 598  617          /* Add in the space for the record associated with each block. */
 599  618          size += size / recordsize * sizeof (dmu_replay_record_t);
 600  619  
 601  620          *sizep = size;
 602  621  
 603  622          return (0);
 604  623  }
 605  624  
/*
 * Arguments shared by the recv_*_check/recv_*_sync sync-task pairs that
 * set up the dataset a send stream will be received into.
 */
struct recvbeginsyncarg {
	const char *tofs;	/* name of the fs being received into */
	const char *tosnap;	/* name of the snapshot to be created */
	dsl_dataset_t *origin;	/* origin snapshot, for clone receives */
	uint64_t fromguid;	/* guid of incremental source snap; 0 = full */
	dmu_objset_type_t type;	/* objset type carried in the stream */
	void *tag;		/* hold tag used by the caller */
	boolean_t force;	/* skip modified-since-last-snap check */
	uint64_t dsflags;	/* extra DS_FLAG_* for the created dataset */
	char clonelastname[MAXNAMELEN];	/* last component of tmp clone name */
	dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */
	cred_t *cr;		/* credentials of the receiving process */
};
 619  638  
/*
 * Check phase of the sync task used when receiving into a dataset that
 * does not yet exist.  arg1 is the parent dsl_dir_t; verifies the target
 * child does not exist and, for clones, that the origin is usable.
 */
/* ARGSUSED */
static int
recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	objset_t *mos = dd->dd_pool->dp_meta_objset;
	uint64_t val;
	int err;

	/* target fs (last component of tofs) must not already exist */
	err = zap_lookup(mos, dd->dd_phys->dd_child_dir_zapobj,
	    strrchr(rbsa->tofs, '/') + 1, sizeof (uint64_t), 1, &val);

	/* ENOENT is the only acceptable outcome; found => EEXIST */
	if (err != ENOENT)
		return (err ? err : EEXIST);

	if (rbsa->origin) {
		/* make sure it's a snap in the same pool */
		if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool)
			return (EXDEV);
		if (!dsl_dataset_is_snapshot(rbsa->origin))
			return (EINVAL);
		/* origin must be the snapshot the stream was sent from */
		if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid)
			return (ENODEV);
	}

	return (0);
}
 648  667  
/*
 * Sync phase for receiving into a new dataset: create it (as a clone of
 * rbsa->origin if one was supplied), own it with dmu_recv_tag, and hand
 * it back to the caller via rbsa->ds.
 */
static void
recv_new_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dir_t *dd = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	/* mark the dataset inconsistent until the receive completes */
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* Create and open new dataset. */
	dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1,
	    rbsa->origin, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj,
	    B_TRUE, dmu_recv_tag, &rbsa->ds));

	if (rbsa->origin == NULL) {
		/* not a clone: create the objset contents from scratch */
		(void) dmu_objset_create_impl(dd->dd_pool->dp_spa,
		    rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx);
	}

	spa_history_log_internal_ds(rbsa->ds, "receive new", tx, "");
}
 670  689  
/*
 * Check phase of the sync task used when receiving into an existing fs
 * (arg1).  Validates that the destination is in a state the stream can
 * be applied to: unmodified since its last snapshot (unless forced), the
 * new snapshot name is free, the incremental source snapshot is present
 * in the destination's snapshot chain, and the temporary clone name is
 * available.
 */
/* ARGSUSED */
static int
recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	int err;
	uint64_t val;

	/* must not have any changes since most recent snapshot */
	if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds))
		return (ETXTBSY);

	/* new snapshot name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_phys->ds_snapnames_zapobj, rbsa->tosnap, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	if (rbsa->fromguid) {
		/* if incremental, most recent snapshot must match fromguid */
		if (ds->ds_prev == NULL)
			return (ENODEV);

		/*
		 * most recent snapshot must match fromguid, or there are no
		 * changes since the fromguid one
		 */
		if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) {
			uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth;
			uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj;
			/* walk the snapshot chain backwards from ds_prev */
			while (obj != 0) {
				dsl_dataset_t *snap;
				err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
				    obj, FTAG, &snap);
				/* any failure to hold => source snap gone */
				if (err)
					return (ENODEV);
				/*
				 * Once we pass a snapshot older than the
				 * last write, fromguid cannot be "no changes
				 * since" the head; give up.
				 */
				if (snap->ds_phys->ds_creation_txg < birth) {
					dsl_dataset_rele(snap, FTAG);
					return (ENODEV);
				}
				if (snap->ds_phys->ds_guid == rbsa->fromguid) {
					dsl_dataset_rele(snap, FTAG);
					break; /* it's ok */
				}
				obj = snap->ds_phys->ds_prev_snap_obj;
				dsl_dataset_rele(snap, FTAG);
			}
			/* exhausted the chain without finding fromguid */
			if (obj == 0)
				return (ENODEV);
		}
	} else {
		/* if full, most recent snapshot must be $ORIGIN */
		if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL)
			return (ENODEV);
	}

	/* temporary clone name must not exist */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_dir->dd_phys->dd_child_dir_zapobj,
	    rbsa->clonelastname, 8, 1, &val);
	if (err == 0)
		return (EEXIST);
	if (err != ENOENT)
		return (err);

	return (0);
}
 741  760  
/*
 * Sync phase for receiving into an existing fs (arg1): create a
 * temporary clone of the fs's latest snapshot (named rbsa->clonelastname)
 * to receive into, so the live fs is untouched until dmu_recv_end().
 * The owned clone is returned via rbsa->ds.
 */
/* ARGSUSED */
static void
recv_existing_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ohds = arg1;
	struct recvbeginsyncarg *rbsa = arg2;
	dsl_pool_t *dp = ohds->ds_dir->dd_pool;
	dsl_dataset_t *cds;
	/* clone stays inconsistent until the receive completes */
	uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags;
	uint64_t dsobj;

	/* create and open the temporary clone */
	dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname,
	    ohds->ds_prev, flags, rbsa->cr, tx);
	VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds));

	/*
	 * If we actually created a non-clone, we need to create the
	 * objset in our new dataset.
	 */
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) {
		(void) dmu_objset_create_impl(dp->dp_spa,
		    cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx);
	}

	rbsa->ds = cds;

	spa_history_log_internal_ds(cds, "receive over existing", tx, "");
}
 771  790  
 772  791  static boolean_t
 773  792  dmu_recv_verify_features(dsl_dataset_t *ds, struct drr_begin *drrb)
 774  793  {
 775  794          int featureflags;
 776  795  
 777  796          featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo);
 778  797  
 779  798          /* Verify pool version supports SA if SA_SPILL feature set */
 780  799          return ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) &&
 781  800              (spa_version(dsl_dataset_get_spa(ds)) < SPA_VERSION_SA));
 782  801  }
 783  802  
/*
 * Set up a receive: validate the BEGIN record, then run the appropriate
 * sync task to create either a brand-new dataset (full/clone stream into
 * a nonexistent tofs) or a temporary clone of the existing tofs
 * (incremental stream).  On success the cookie holds the logical and
 * real datasets the stream will be applied to.
 *
 * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin()
 * succeeds; otherwise we will leak the holds on the datasets.
 */
int
dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb,
    boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc)
{
	int err = 0;
	boolean_t byteswap;
	struct recvbeginsyncarg rbsa = { 0 };
	uint64_t versioninfo;
	int flags;
	dsl_dataset_t *ds;

	/* determine stream endianness from the magic number */
	if (drrb->drr_magic == DMU_BACKUP_MAGIC)
		byteswap = FALSE;
	else if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		byteswap = TRUE;
	else
		return (EINVAL);

	rbsa.tofs = tofs;
	rbsa.tosnap = tosnap;
	rbsa.origin = origin ? origin->os_dsl_dataset : NULL;
	rbsa.fromguid = drrb->drr_fromguid;
	rbsa.type = drrb->drr_type;
	rbsa.tag = FTAG;
	rbsa.dsflags = 0;
	rbsa.cr = CRED();
	versioninfo = drrb->drr_versioninfo;
	flags = drrb->drr_flags;

	/* fields above were read raw from the stream; swap if needed */
	if (byteswap) {
		rbsa.type = BSWAP_32(rbsa.type);
		rbsa.fromguid = BSWAP_64(rbsa.fromguid);
		versioninfo = BSWAP_64(versioninfo);
		flags = BSWAP_32(flags);
	}

	/*
	 * Compound (send -R) streams are handled by userland, not here;
	 * also reject unknown objset types and clone streams without an
	 * origin to clone from.
	 */
	if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM ||
	    rbsa.type >= DMU_OST_NUMTYPES ||
	    ((flags & DRR_FLAG_CLONE) && origin == NULL))
		return (EINVAL);

	if (flags & DRR_FLAG_CI_DATA)
		rbsa.dsflags = DS_FLAG_CI_DATASET;

	bzero(drc, sizeof (dmu_recv_cookie_t));
	drc->drc_drrb = drrb;
	drc->drc_tosnap = tosnap;
	drc->drc_top_ds = top_ds;
	drc->drc_force = force;

	/*
	 * Process the begin in syncing context.
	 */

	/* open the dataset we are logically receiving into */
	err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds);
	if (err == 0) {
		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (ENOTSUP);
		}
		/* target fs already exists; recv into temp clone */

		/* Can't recv a clone into an existing fs */
		if (flags & DRR_FLAG_CLONE) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EINVAL);
		}

		/* must not have an incremental recv already in progress */
		if (!mutex_tryenter(&ds->ds_recvlock)) {
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (EBUSY);
		}

		/* tmp clone name is: tofs/%tosnap" */
		(void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname),
		    "%%%s", tosnap);
		rbsa.force = force;
		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_existing_check, recv_existing_sync, ds, &rbsa, 5);
		if (err) {
			/* drop the recvlock and hold taken above */
			mutex_exit(&ds->ds_recvlock);
			dsl_dataset_rele(ds, dmu_recv_tag);
			return (err);
		}
		/* on success: recvlock and holds are released by recv_end */
		drc->drc_logical_ds = ds;
		drc->drc_real_ds = rbsa.ds;
	} else if (err == ENOENT) {
		/* target fs does not exist; must be a full backup or clone */
		char *cp;

		/*
		 * If it's a non-clone incremental, we are missing the
		 * target fs, so fail the recv.
		 */
		if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE))
			return (ENOENT);

		/* Open the parent of tofs */
		cp = strrchr(tofs, '/');
		*cp = '\0';		/* temporarily truncate to parent name */
		err = dsl_dataset_hold(tofs, FTAG, &ds);
		*cp = '/';		/* restore the caller's string */
		if (err)
			return (err);

		if (dmu_recv_verify_features(ds, drrb)) {
			dsl_dataset_rele(ds, FTAG);
			return (ENOTSUP);
		}

		err = dsl_sync_task_do(ds->ds_dir->dd_pool,
		    recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5);
		dsl_dataset_rele(ds, FTAG);
		if (err)
			return (err);
		drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds;
		drc->drc_newfs = B_TRUE;
	}

	/* any other error from dsl_dataset_hold() falls through here */
	return (err);
}
 911  930  
/*
 * State threaded through dmu_recv_stream() and the restore_*() record
 * handlers.
 */
struct restorearg {
	int err;		/* first error encountered while reading */
	int byteswap;		/* stream is opposite-endian; swap records */
	vnode_t *vp;		/* vnode the stream is read from */
	char *buf;		/* scratch buffer returned by restore_read() */
	uint64_t voff;		/* current byte offset into the stream */
	int bufsize; /* amount of memory allocated for buf */
	zio_cksum_t cksum;	/* running fletcher4 checksum of the stream */
	avl_tree_t *guid_to_ds_map;	/* guid -> dataset, for dedup streams */
};
 922  941  
/*
 * Node in the guid_to_ds_map AVL tree: maps a dataset guid to a held
 * dataset so DRR_WRITE_BYREF records can find previously received data.
 */
typedef struct guid_map_entry {
	uint64_t	guid;		/* search key */
	dsl_dataset_t	*gme_ds;	/* dataset with that guid */
	avl_node_t	avlnode;	/* linkage in guid_to_ds_map */
} guid_map_entry_t;
 928  947  
 929  948  static int
 930  949  guid_compare(const void *arg1, const void *arg2)
 931  950  {
 932  951          const guid_map_entry_t *gmep1 = arg1;
 933  952          const guid_map_entry_t *gmep2 = arg2;
 934  953  
 935  954          if (gmep1->guid < gmep2->guid)
 936  955                  return (-1);
 937  956          else if (gmep1->guid > gmep2->guid)
 938  957                  return (1);
 939  958          return (0);
 940  959  }
 941  960  
/*
 * onexit callback that tears down a guid_to_ds_map: release each
 * dataset hold (the hold tag is the tree pointer itself) and free the
 * nodes and the tree.
 */
static void
free_guid_map_onexit(void *arg)
{
	avl_tree_t *ca = arg;
	void *cookie = NULL;
	guid_map_entry_t *gmep;

	while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) {
		/* the map's datasets were held with 'ca' as the tag */
		dsl_dataset_rele(gmep->gme_ds, ca);
		kmem_free(gmep, sizeof (guid_map_entry_t));
	}
	avl_destroy(ca);
	kmem_free(ca, sizeof (avl_tree_t));
}
 956  975  
/*
 * Read the next 'len' bytes of the stream into ra->buf, folding them
 * into the running checksum.  Returns a pointer to the data on success;
 * on failure sets ra->err and returns NULL.  'len' must be a multiple
 * of 8.
 */
static void *
restore_read(struct restorearg *ra, int len)
{
	void *rv;
	int done = 0;

	/* some things will require 8-byte alignment, so everything must */
	ASSERT3U(len % 8, ==, 0);

	/* loop until we have the full 'len' bytes (vn_rdwr may short-read) */
	while (done < len) {
		ssize_t resid;

		ra->err = vn_rdwr(UIO_READ, ra->vp,
		    (caddr_t)ra->buf + done, len - done,
		    ra->voff, UIO_SYSSPACE, FAPPEND,
		    RLIM64_INFINITY, CRED(), &resid);

		/* nothing was read at all: treat as a truncated stream */
		if (resid == len - done)
			ra->err = EINVAL;
		ra->voff += len - done - resid;
		done = len - resid;
		if (ra->err)
			return (NULL);
	}

	ASSERT3U(done, ==, len);
	rv = ra->buf;
	/* keep the checksum in the stream's native byte order */
	if (ra->byteswap)
		fletcher_4_incremental_byteswap(rv, len, &ra->cksum);
	else
		fletcher_4_incremental_native(rv, len, &ra->cksum);
	return (rv);
}
 990 1009  
/*
 * Byteswap a replay record header in place, converting an
 * opposite-endian stream record to native byte order.  Only the fields
 * of the record type indicated by drr_type are swapped; record payloads
 * are swapped separately by the restore_*() handlers.
 */
static void
backup_byteswap(dmu_replay_record_t *drr)
{
#define	DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X))
#define	DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X))
	drr->drr_type = BSWAP_32(drr->drr_type);
	drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen);
	switch (drr->drr_type) {
	case DRR_BEGIN:
		DO64(drr_begin.drr_magic);
		DO64(drr_begin.drr_versioninfo);
		DO64(drr_begin.drr_creation_time);
		DO32(drr_begin.drr_type);
		DO32(drr_begin.drr_flags);
		DO64(drr_begin.drr_toguid);
		DO64(drr_begin.drr_fromguid);
		break;
	case DRR_OBJECT:
		DO64(drr_object.drr_object);
		/* DO64(drr_object.drr_allocation_txg); */
		DO32(drr_object.drr_type);
		DO32(drr_object.drr_bonustype);
		DO32(drr_object.drr_blksz);
		DO32(drr_object.drr_bonuslen);
		DO64(drr_object.drr_toguid);
		break;
	case DRR_FREEOBJECTS:
		DO64(drr_freeobjects.drr_firstobj);
		DO64(drr_freeobjects.drr_numobjs);
		DO64(drr_freeobjects.drr_toguid);
		break;
	case DRR_WRITE:
		DO64(drr_write.drr_object);
		DO32(drr_write.drr_type);
		DO64(drr_write.drr_offset);
		DO64(drr_write.drr_length);
		DO64(drr_write.drr_toguid);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write.drr_key.ddk_prop);
		break;
	case DRR_WRITE_BYREF:
		DO64(drr_write_byref.drr_object);
		DO64(drr_write_byref.drr_offset);
		DO64(drr_write_byref.drr_length);
		DO64(drr_write_byref.drr_toguid);
		DO64(drr_write_byref.drr_refguid);
		DO64(drr_write_byref.drr_refobject);
		DO64(drr_write_byref.drr_refoffset);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]);
		DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]);
		DO64(drr_write_byref.drr_key.ddk_prop);
		break;
	case DRR_FREE:
		DO64(drr_free.drr_object);
		DO64(drr_free.drr_offset);
		DO64(drr_free.drr_length);
		DO64(drr_free.drr_toguid);
		break;
	case DRR_SPILL:
		DO64(drr_spill.drr_object);
		DO64(drr_spill.drr_length);
		DO64(drr_spill.drr_toguid);
		break;
	case DRR_END:
		DO64(drr_end.drr_checksum.zc_word[0]);
		DO64(drr_end.drr_checksum.zc_word[1]);
		DO64(drr_end.drr_checksum.zc_word[2]);
		DO64(drr_end.drr_checksum.zc_word[3]);
		DO64(drr_end.drr_toguid);
		break;
	}
#undef DO64
#undef DO32
}
1070 1089  
/*
 * Apply a DRR_OBJECT record: (re)allocate the object with the given
 * type/blocksize/bonus settings, set its checksum and compression
 * properties, and install the bonus buffer data (if any) that follows
 * the record in the stream.
 */
static int
restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro)
{
	int err;
	dmu_tx_t *tx;
	void *data = NULL;

	/* sanity-check every field that came off the wire */
	if (drro->drr_type == DMU_OT_NONE ||
	    !DMU_OT_IS_VALID(drro->drr_type) ||
	    !DMU_OT_IS_VALID(drro->drr_bonustype) ||
	    drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS ||
	    drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS ||
	    P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) ||
	    drro->drr_blksz < SPA_MINBLOCKSIZE ||
	    drro->drr_blksz > SPA_MAXBLOCKSIZE ||
	    drro->drr_bonuslen > DN_MAX_BONUSLEN) {
		return (EINVAL);
	}

	err = dmu_object_info(os, drro->drr_object, NULL);

	/* ENOENT (object free) and 0 (allocated) are the only valid states */
	if (err != 0 && err != ENOENT)
		return (EINVAL);

	/* bonus payload is padded to 8 bytes in the stream */
	if (drro->drr_bonuslen) {
		data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8));
		if (ra->err)
			return (ra->err);
	}

	if (err == ENOENT) {
		/* currently free, want to be allocated */
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
		err = dmu_tx_assign(tx, TXG_WAIT);
		if (err) {
			dmu_tx_abort(tx);
			return (err);
		}
		err = dmu_object_claim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen, tx);
		dmu_tx_commit(tx);
	} else {
		/* currently allocated, want to be allocated */
		err = dmu_object_reclaim(os, drro->drr_object,
		    drro->drr_type, drro->drr_blksz,
		    drro->drr_bonustype, drro->drr_bonuslen);
	}
	if (err) {
		return (EINVAL);
	}

	/* second tx: set properties and fill in the bonus buffer */
	tx = dmu_tx_create(os);
	dmu_tx_hold_bonus(tx, drro->drr_object);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}

	dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype,
	    tx);
	dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx);

	if (data != NULL) {
		dmu_buf_t *db;

		VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db));
		dmu_buf_will_dirty(db, tx);

		ASSERT3U(db->db_size, >=, drro->drr_bonuslen);
		bcopy(data, db->db_data, drro->drr_bonuslen);
		/* bonus contents are typed; swap them if the stream was */
		if (ra->byteswap) {
			dmu_object_byteswap_t byteswap =
			    DMU_OT_BYTESWAP(drro->drr_bonustype);
			dmu_ot_byteswap[byteswap].ob_func(db->db_data,
			    drro->drr_bonuslen);
		}
		dmu_buf_rele(db, FTAG);
	}
	dmu_tx_commit(tx);
	return (0);
}
1155 1174  
1156 1175  /* ARGSUSED */
1157 1176  static int
1158 1177  restore_freeobjects(struct restorearg *ra, objset_t *os,
1159 1178      struct drr_freeobjects *drrfo)
1160 1179  {
1161 1180          uint64_t obj;
1162 1181  
1163 1182          if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj)
1164 1183                  return (EINVAL);
1165 1184  
1166 1185          for (obj = drrfo->drr_firstobj;
1167 1186              obj < drrfo->drr_firstobj + drrfo->drr_numobjs;
1168 1187              (void) dmu_object_next(os, &obj, FALSE, 0)) {
1169 1188                  int err;
1170 1189  
1171 1190                  if (dmu_object_info(os, obj, NULL) != 0)
1172 1191                          continue;
1173 1192  
1174 1193                  err = dmu_free_object(os, obj);
1175 1194                  if (err)
1176 1195                          return (err);
1177 1196          }
1178 1197          return (0);
1179 1198  }
1180 1199  
/*
 * Apply a DRR_WRITE record: read drr_length bytes of payload from the
 * stream and write them to the given object at the given offset.
 */
static int
restore_write(struct restorearg *ra, objset_t *os,
    struct drr_write *drrw)
{
	dmu_tx_t *tx;
	void *data;
	int err;

	/* reject offset+length overflow and unknown object types */
	if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset ||
	    !DMU_OT_IS_VALID(drrw->drr_type))
		return (EINVAL);

	data = restore_read(ra, drrw->drr_length);
	if (data == NULL)
		return (ra->err);

	/* the target object must have been created by an earlier record */
	if (dmu_object_info(os, drrw->drr_object, NULL) != 0)
		return (EINVAL);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	/* payload is typed data; byteswap it per the object type */
	if (ra->byteswap) {
		dmu_object_byteswap_t byteswap =
		    DMU_OT_BYTESWAP(drrw->drr_type);
		dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length);
	}
	dmu_write(os, drrw->drr_object,
	    drrw->drr_offset, drrw->drr_length, data, tx);
	dmu_tx_commit(tx);
	return (0);
}
1219 1238  
/*
 * Handle a DRR_WRITE_BYREF record.  This record is used in dedup'ed
 * streams to refer to a copy of the data that is already on the
 * system because it came in earlier in the stream.  This function
 * finds the earlier copy of the data, and uses that copy instead of
 * data from the stream to fulfill this write.
 */
static int
restore_write_byref(struct restorearg *ra, objset_t *os,
    struct drr_write_byref *drrwbr)
{
	dmu_tx_t *tx;
	int err;
	guid_map_entry_t gmesrch;
	guid_map_entry_t *gmep;
	avl_index_t	where;
	objset_t *ref_os = NULL;
	dmu_buf_t *dbp;

	/* reject offset+length overflow */
	if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset)
		return (EINVAL);

	/*
	 * If the GUID of the referenced dataset is different from the
	 * GUID of the target dataset, find the referenced dataset.
	 */
	if (drrwbr->drr_toguid != drrwbr->drr_refguid) {
		gmesrch.guid = drrwbr->drr_refguid;
		if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch,
		    &where)) == NULL) {
			return (EINVAL);
		}
		if (dmu_objset_from_ds(gmep->gme_ds, &ref_os))
			return (EINVAL);
	} else {
		/* self-reference: the data lives in the target objset */
		ref_os = os;
	}

	/* read the previously-received copy of the data */
	if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject,
	    drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH))
		return (err);

	tx = dmu_tx_create(os);

	dmu_tx_hold_write(tx, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length);
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_write(os, drrwbr->drr_object,
	    drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx);
	dmu_buf_rele(dbp, FTAG);
	dmu_tx_commit(tx);
	return (0);
}
1277 1296  
/*
 * Apply a DRR_SPILL record: read the spill-block payload from the
 * stream and install it as the spill block of the given object,
 * growing the spill block if necessary.
 */
static int
restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs)
{
	dmu_tx_t *tx;
	void *data;
	dmu_buf_t *db, *db_spill;
	int err;

	if (drrs->drr_length < SPA_MINBLOCKSIZE ||
	    drrs->drr_length > SPA_MAXBLOCKSIZE)
		return (EINVAL);

	data = restore_read(ra, drrs->drr_length);
	if (data == NULL)
		return (ra->err);

	/* the target object must already exist */
	if (dmu_object_info(os, drrs->drr_object, NULL) != 0)
		return (EINVAL);

	/* the spill block is reached through the object's bonus buffer */
	VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db));
	if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) {
		dmu_buf_rele(db, FTAG);
		return (err);
	}

	tx = dmu_tx_create(os);

	dmu_tx_hold_spill(tx, db->db_object);

	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err) {
		dmu_buf_rele(db, FTAG);
		dmu_buf_rele(db_spill, FTAG);
		dmu_tx_abort(tx);
		return (err);
	}
	dmu_buf_will_dirty(db_spill, tx);

	/* grow the spill block if the incoming data doesn't fit */
	if (db_spill->db_size < drrs->drr_length)
		VERIFY(0 == dbuf_spill_set_blksz(db_spill,
		    drrs->drr_length, tx));
	bcopy(data, db_spill->db_data, drrs->drr_length);

	dmu_buf_rele(db, FTAG);
	dmu_buf_rele(db_spill, FTAG);

	dmu_tx_commit(tx);
	return (0);
}
1327 1346  
1328 1347  /* ARGSUSED */
1329 1348  static int
1330 1349  restore_free(struct restorearg *ra, objset_t *os,
1331 1350      struct drr_free *drrf)
1332 1351  {
1333 1352          int err;
1334 1353  
1335 1354          if (drrf->drr_length != -1ULL &&
1336 1355              drrf->drr_offset + drrf->drr_length < drrf->drr_offset)
1337 1356                  return (EINVAL);
1338 1357  
1339 1358          if (dmu_object_info(os, drrf->drr_object, NULL) != 0)
1340 1359                  return (EINVAL);
1341 1360  
1342 1361          err = dmu_free_long_range(os, drrf->drr_object,
1343 1362              drrf->drr_offset, drrf->drr_length);
1344 1363          return (err);
1345 1364  }
1346 1365  
/*
 * Read a replay stream from *vp (starting at *voffp) and apply its
 * records to the dataset set up by dmu_recv_begin().  On success the
 * dataset is left in the inconsistent "restoring" state.
 *
 * NB: callers *must* call dmu_recv_end() if this succeeds.
 */
int
dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp,
    int cleanup_fd, uint64_t *action_handlep)
{
	struct restorearg ra = { 0 };
	dmu_replay_record_t *drr;
	objset_t *os;
	zio_cksum_t pcksum;
	int featureflags;

	/* A byteswapped magic number means the stream needs swapping. */
	if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC))
		ra.byteswap = TRUE;

	{
		/* compute checksum of drr_begin record */
		dmu_replay_record_t *drr;
		drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP);

		drr->drr_type = DRR_BEGIN;
		drr->drr_u.drr_begin = *drc->drc_drrb;
		if (ra.byteswap) {
			fletcher_4_incremental_byteswap(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		} else {
			fletcher_4_incremental_native(drr,
			    sizeof (dmu_replay_record_t), &ra.cksum);
		}
		kmem_free(drr, sizeof (dmu_replay_record_t));
	}

	/*
	 * Swap the begin record in place so subsequent field reads see
	 * native byte order.  (The checksum above was taken first, over
	 * the record as it appeared on the wire.)
	 */
	if (ra.byteswap) {
		struct drr_begin *drrb = drc->drc_drrb;
		drrb->drr_magic = BSWAP_64(drrb->drr_magic);
		drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo);
		drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time);
		drrb->drr_type = BSWAP_32(drrb->drr_type);
		drrb->drr_toguid = BSWAP_64(drrb->drr_toguid);
		drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid);
	}

	ra.vp = vp;
	ra.voff = *voffp;
	ra.bufsize = 1<<20;	/* 1MB read-ahead buffer for restore_read() */
	ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP);

	/* these were verified in dmu_recv_begin */
	ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) ==
	    DMU_SUBSTREAM);
	ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES);

	/*
	 * Open the objset we are modifying.
	 */
	VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0);

	ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT);

	featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo);

	/* if this stream is dedup'ed, set up the avl tree for guid mapping */
	if (featureflags & DMU_BACKUP_FEATURE_DEDUP) {
		minor_t minor;

		/* dedup streams require a cleanup fd for the onexit hooks */
		if (cleanup_fd == -1) {
			ra.err = EBADF;
			goto out;
		}
		ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor);
		if (ra.err) {
			/* prevent the fd_rele at "out" from running */
			cleanup_fd = -1;
			goto out;
		}

		if (*action_handlep == 0) {
			/* first stream: create the map and register cleanup */
			ra.guid_to_ds_map =
			    kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
			avl_create(ra.guid_to_ds_map, guid_compare,
			    sizeof (guid_map_entry_t),
			    offsetof(guid_map_entry_t, avlnode));
			ra.err = zfs_onexit_add_cb(minor,
			    free_guid_map_onexit, ra.guid_to_ds_map,
			    action_handlep);
			if (ra.err)
				goto out;
		} else {
			/* later stream of a package: reuse the existing map */
			ra.err = zfs_onexit_cb_data(minor, *action_handlep,
			    (void **)&ra.guid_to_ds_map);
			if (ra.err)
				goto out;
		}

		drc->drc_guid_to_ds_map = ra.guid_to_ds_map;
	}

	/*
	 * Read records and process them.
	 */
	pcksum = ra.cksum;
	while (ra.err == 0 &&
	    NULL != (drr = restore_read(&ra, sizeof (*drr)))) {
		/* allow the receive to be interrupted by a signal */
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			ra.err = EINTR;
			goto out;
		}

		if (ra.byteswap)
			backup_byteswap(drr);

		switch (drr->drr_type) {
		case DRR_OBJECT:
		{
			/*
			 * We need to make a copy of the record header,
			 * because restore_{object,write} may need to
			 * restore_read(), which will invalidate drr.
			 */
			struct drr_object drro = drr->drr_u.drr_object;
			ra.err = restore_object(&ra, os, &drro);
			break;
		}
		case DRR_FREEOBJECTS:
		{
			struct drr_freeobjects drrfo =
			    drr->drr_u.drr_freeobjects;
			ra.err = restore_freeobjects(&ra, os, &drrfo);
			break;
		}
		case DRR_WRITE:
		{
			struct drr_write drrw = drr->drr_u.drr_write;
			ra.err = restore_write(&ra, os, &drrw);
			break;
		}
		case DRR_WRITE_BYREF:
		{
			struct drr_write_byref drrwbr =
			    drr->drr_u.drr_write_byref;
			ra.err = restore_write_byref(&ra, os, &drrwbr);
			break;
		}
		case DRR_FREE:
		{
			struct drr_free drrf = drr->drr_u.drr_free;
			ra.err = restore_free(&ra, os, &drrf);
			break;
		}
		case DRR_END:
		{
			struct drr_end drre = drr->drr_u.drr_end;
			/*
			 * We compare against the *previous* checksum
			 * value, because the stored checksum is of
			 * everything before the DRR_END record.
			 */
			if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum))
				ra.err = ECKSUM;
			goto out;
		}
		case DRR_SPILL:
		{
			struct drr_spill drrs = drr->drr_u.drr_spill;
			ra.err = restore_spill(&ra, os, &drrs);
			break;
		}
		default:
			ra.err = EINVAL;
			goto out;
		}
		pcksum = ra.cksum;
	}
	/* the loop only exits without DRR_END when a read/record failed */
	ASSERT(ra.err != 0);

out:
	if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1))
		zfs_onexit_fd_rele(cleanup_fd);

	if (ra.err != 0) {
		/*
		 * destroy what we created, so we don't leave it in the
		 * inconsistent restoring state.
		 */
		txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0);

		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		if (drc->drc_real_ds != drc->drc_logical_ds) {
			mutex_exit(&drc->drc_logical_ds->ds_recvlock);
			dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag);
		}
	}

	kmem_free(ra.buf, ra.bufsize);
	*voffp = ra.voff;
	return (ra.err);
}
1545 1564  
/* Arguments passed through dsl_sync_task_do() to recv_end_{check,sync}. */
struct recvendsyncarg {
	char *tosnap;		/* name of the snapshot to create */
	uint64_t creation_time;	/* creation time from the begin record */
	uint64_t toguid;	/* guid from the begin record */
};
1551 1570  
1552 1571  static int
1553 1572  recv_end_check(void *arg1, void *arg2, dmu_tx_t *tx)
1554 1573  {
1555 1574          dsl_dataset_t *ds = arg1;
1556 1575          struct recvendsyncarg *resa = arg2;
1557 1576  
1558 1577          return (dsl_dataset_snapshot_check(ds, resa->tosnap, tx));
1559 1578  }
1560 1579  
/*
 * Sync-task sync callback: create the end-of-receive snapshot, stamp it
 * with the creation time and guid from the stream's begin record, and
 * clear the "inconsistent" flag on both the snapshot and the dataset.
 */
static void
recv_end_sync(void *arg1, void *arg2, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = arg1;
	struct recvendsyncarg *resa = arg2;

	dsl_dataset_snapshot_sync(ds, resa->tosnap, tx);

	/* set snapshot's creation time and guid */
	dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
	ds->ds_prev->ds_phys->ds_creation_time = resa->creation_time;
	ds->ds_prev->ds_phys->ds_guid = resa->toguid;
	ds->ds_prev->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;

	/* the dataset itself is now fully received and consistent too */
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT;
	spa_history_log_internal_ds(ds, "finished receiving", tx, "");
}
1579 1598  
1580 1599  static int
1581 1600  add_ds_to_guidmap(avl_tree_t *guid_map, dsl_dataset_t *ds)
1582 1601  {
1583 1602          dsl_pool_t *dp = ds->ds_dir->dd_pool;
1584 1603          uint64_t snapobj = ds->ds_phys->ds_prev_snap_obj;
1585 1604          dsl_dataset_t *snapds;
1586 1605          guid_map_entry_t *gmep;
1587 1606          int err;
1588 1607  
1589 1608          ASSERT(guid_map != NULL);
1590 1609  
1591 1610          rw_enter(&dp->dp_config_rwlock, RW_READER);
1592 1611          err = dsl_dataset_hold_obj(dp, snapobj, guid_map, &snapds);
1593 1612          if (err == 0) {
1594 1613                  gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP);
1595 1614                  gmep->guid = snapds->ds_phys->ds_guid;
1596 1615                  gmep->gme_ds = snapds;
1597 1616                  avl_add(guid_map, gmep);
1598 1617          }
1599 1618  
1600 1619          rw_exit(&dp->dp_config_rwlock);
1601 1620          return (err);
1602 1621  }
1603 1622  
/*
 * Finish a receive into an existing filesystem: swap the contents of
 * the temporary clone (drc_real_ds) into the target (drc_logical_ds),
 * snapshot the result, and destroy the clone.  Returns 0 on success or
 * an errno; on failure the swap is undone.
 */
static int
dmu_recv_existing_end(dmu_recv_cookie_t *drc)
{
	struct recvendsyncarg resa;
	dsl_dataset_t *ds = drc->drc_logical_ds;
	int err, myerr;

	/*
	 * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
	 * expects it to have a ds_user_ptr (and zil), but clone_swap()
	 * can close it.
	 */
	txg_wait_synced(ds->ds_dir->dd_pool, 0);

	if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) {
		err = dsl_dataset_clone_swap(drc->drc_real_ds, ds,
		    drc->drc_force);
		if (err)
			goto out;
	} else {
		/* someone else owns the target; give up our holds and bail */
		mutex_exit(&ds->ds_recvlock);
		dsl_dataset_rele(ds, dmu_recv_tag);
		(void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag,
		    B_FALSE);
		return (EBUSY);
	}

	resa.creation_time = drc->drc_drrb->drr_creation_time;
	resa.toguid = drc->drc_drrb->drr_toguid;
	resa.tosnap = drc->drc_tosnap;

	err = dsl_sync_task_do(ds->ds_dir->dd_pool,
	    recv_end_check, recv_end_sync, ds, &resa, 3);
	if (err) {
		/* swap back */
		(void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE);
	}

out:
	mutex_exit(&ds->ds_recvlock);
	/* for dedup streams, remember the new snapshot by guid */
	if (err == 0 && drc->drc_guid_to_ds_map != NULL)
		(void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
	dsl_dataset_disown(ds, dmu_recv_tag);
	/* the temporary clone is no longer needed either way */
	myerr = dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE);
	ASSERT3U(myerr, ==, 0);
	return (err);
}
1651 1670  
1652 1671  static int
1653 1672  dmu_recv_new_end(dmu_recv_cookie_t *drc)
1654 1673  {
1655 1674          struct recvendsyncarg resa;
1656 1675          dsl_dataset_t *ds = drc->drc_logical_ds;
1657 1676          int err;
1658 1677  
1659 1678          /*
1660 1679           * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean()
1661 1680           * expects it to have a ds_user_ptr (and zil), but clone_swap()
1662 1681           * can close it.
1663 1682           */
1664 1683          txg_wait_synced(ds->ds_dir->dd_pool, 0);
1665 1684  
1666 1685          resa.creation_time = drc->drc_drrb->drr_creation_time;
1667 1686          resa.toguid = drc->drc_drrb->drr_toguid;
1668 1687          resa.tosnap = drc->drc_tosnap;
1669 1688  
1670 1689          err = dsl_sync_task_do(ds->ds_dir->dd_pool,
1671 1690              recv_end_check, recv_end_sync, ds, &resa, 3);
1672 1691          if (err) {
1673 1692                  /* clean up the fs we just recv'd into */
1674 1693                  (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE);
1675 1694          } else {
1676 1695                  if (drc->drc_guid_to_ds_map != NULL)
1677 1696                          (void) add_ds_to_guidmap(drc->drc_guid_to_ds_map, ds);
1678 1697                  /* release the hold from dmu_recv_begin */
1679 1698                  dsl_dataset_disown(ds, dmu_recv_tag);
1680 1699          }
1681 1700          return (err);
1682 1701  }
1683 1702  
1684 1703  int
1685 1704  dmu_recv_end(dmu_recv_cookie_t *drc)
1686 1705  {
1687 1706          if (drc->drc_logical_ds != drc->drc_real_ds)
1688 1707                  return (dmu_recv_existing_end(drc));
1689 1708          else
1690 1709                  return (dmu_recv_new_end(drc));
1691 1710  }
  
    | 
      ↓ open down ↓ | 
    1171 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX