1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23  * Copyright (c) 2013, 2015 by Delphix. All rights reserved.
  24  */
  25 
  26 #include <sys/types.h>
  27 #include <sys/param.h>
  28 #include <sys/systm.h>
  29 #include <sys/sysmacros.h>
  30 #include <sys/cmn_err.h>
  31 #include <sys/kmem.h>
  32 #include <sys/thread.h>
  33 #include <sys/file.h>
  34 #include <sys/fcntl.h>
  35 #include <sys/vfs.h>
  36 #include <sys/fs/zfs.h>
  37 #include <sys/zfs_znode.h>
  38 #include <sys/zfs_dir.h>
  39 #include <sys/zfs_acl.h>
  40 #include <sys/zfs_fuid.h>
  41 #include <sys/spa.h>
  42 #include <sys/zil.h>
  43 #include <sys/byteorder.h>
  44 #include <sys/stat.h>
  45 #include <sys/mode.h>
  46 #include <sys/acl.h>
  47 #include <sys/atomic.h>
  48 #include <sys/cred.h>
  49 
  50 /*
  51  * Functions to replay ZFS intent log (ZIL) records
  52  * The functions are called through a function vector (zfs_replay_vector)
  53  * which is indexed by the transaction type.
  54  */
  55 
  56 static void
  57 zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode,
  58     uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid)
  59 {
  60         bzero(vap, sizeof (*vap));
  61         vap->va_mask = (uint_t)mask;
  62         vap->va_type = IFTOVT(mode);
  63         vap->va_mode = mode & MODEMASK;
  64         vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid;
  65         vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ? -1 : gid;
  66         vap->va_rdev = zfs_cmpldev(rdev);
  67         vap->va_nodeid = nodeid;
  68 }
  69 
  70 /* ARGSUSED */
  71 static int
  72 zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap)
  73 {
  74         return (SET_ERROR(ENOTSUP));
  75 }
  76 
  77 static void
  78 zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
  79 {
  80         xoptattr_t *xoap = NULL;
  81         uint64_t *attrs;
  82         uint64_t *crtime;
  83         uint32_t *bitmap;
  84         void *scanstamp;
  85         int i;
  86 
  87         xvap->xva_vattr.va_mask |= AT_XVATTR;
  88         if ((xoap = xva_getxoptattr(xvap)) == NULL) {
  89                 xvap->xva_vattr.va_mask &= ~AT_XVATTR; /* shouldn't happen */
  90                 return;
  91         }
  92 
  93         ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize);
  94 
  95         bitmap = &lrattr->lr_attr_bitmap;
  96         for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++)
  97                 xvap->xva_reqattrmap[i] = *bitmap;
  98 
  99         attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1);
 100         crtime = attrs + 1;
 101         scanstamp = (caddr_t)(crtime + 2);
 102 
 103         if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
 104                 xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0);
 105         if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
 106                 xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0);
 107         if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
 108                 xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0);
 109         if (XVA_ISSET_REQ(xvap, XAT_READONLY))
 110                 xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0);
 111         if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
 112                 xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0);
 113         if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
 114                 xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0);
 115         if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
 116                 xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0);
 117         if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
 118                 xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0);
 119         if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
 120                 xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0);
 121         if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
 122                 xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0);
 123         if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
 124                 xoap->xoa_av_quarantined =
 125                     ((*attrs & XAT0_AV_QUARANTINED) != 0);
 126         if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
 127                 ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime);
 128         if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
 129                 ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID));
 130 
 131                 bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ);
 132         } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) {
 133                 /*
 134                  * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid
 135                  * at the same time, so we can share the same space.
 136                  */
 137                 bcopy(scanstamp, &xoap->xoa_projid, sizeof (uint64_t));
 138         }
 139         if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
 140                 xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0);
 141         if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
 142                 xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0);
 143         if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
 144                 xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0);
 145         if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT))
 146                 xoap->xoa_projinherit = ((*attrs & XAT0_PROJINHERIT) != 0);
 147 }
 148 
 149 static int
 150 zfs_replay_domain_cnt(uint64_t uid, uint64_t gid)
 151 {
 152         uint64_t uid_idx;
 153         uint64_t gid_idx;
 154         int domcnt = 0;
 155 
 156         uid_idx = FUID_INDEX(uid);
 157         gid_idx = FUID_INDEX(gid);
 158         if (uid_idx)
 159                 domcnt++;
 160         if (gid_idx > 0 && gid_idx != uid_idx)
 161                 domcnt++;
 162 
 163         return (domcnt);
 164 }
 165 
 166 static void *
 167 zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start,
 168     int domcnt)
 169 {
 170         int i;
 171 
 172         for (i = 0; i != domcnt; i++) {
 173                 fuid_infop->z_domain_table[i] = start;
 174                 start = (caddr_t)start + strlen(start) + 1;
 175         }
 176 
 177         return (start);
 178 }
 179 
 180 /*
 181  * Set the uid/gid in the fuid_info structure.
 182  */
 183 static void
 184 zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid)
 185 {
 186         /*
 187          * If owner or group are log specific FUIDs then slurp up
 188          * domain information and build zfs_fuid_info_t
 189          */
 190         if (IS_EPHEMERAL(uid))
 191                 fuid_infop->z_fuid_owner = uid;
 192 
 193         if (IS_EPHEMERAL(gid))
 194                 fuid_infop->z_fuid_group = gid;
 195 }
 196 
 197 /*
 198  * Load fuid domains into fuid_info_t
 199  */
 200 static zfs_fuid_info_t *
 201 zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid)
 202 {
 203         int domcnt;
 204 
 205         zfs_fuid_info_t *fuid_infop;
 206 
 207         fuid_infop = zfs_fuid_info_alloc();
 208 
 209         domcnt = zfs_replay_domain_cnt(uid, gid);
 210 
 211         if (domcnt == 0)
 212                 return (fuid_infop);
 213 
 214         fuid_infop->z_domain_table =
 215             kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
 216 
 217         zfs_replay_fuid_ugid(fuid_infop, uid, gid);
 218 
 219         fuid_infop->z_domain_cnt = domcnt;
 220         *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt);
 221         return (fuid_infop);
 222 }
 223 
 224 /*
 225  * load zfs_fuid_t's and fuid_domains into fuid_info_t
 226  */
 227 static zfs_fuid_info_t *
 228 zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid,
 229     uint64_t gid)
 230 {
 231         uint64_t *log_fuid = (uint64_t *)start;
 232         zfs_fuid_info_t *fuid_infop;
 233         int i;
 234 
 235         fuid_infop = zfs_fuid_info_alloc();
 236         fuid_infop->z_domain_cnt = domcnt;
 237 
 238         fuid_infop->z_domain_table =
 239             kmem_zalloc(domcnt * sizeof (char **), KM_SLEEP);
 240 
 241         for (i = 0; i != idcnt; i++) {
 242                 zfs_fuid_t *zfuid;
 243 
 244                 zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP);
 245                 zfuid->z_logfuid = *log_fuid;
 246                 zfuid->z_id = -1;
 247                 zfuid->z_domidx = 0;
 248                 list_insert_tail(&fuid_infop->z_fuids, zfuid);
 249                 log_fuid++;
 250         }
 251 
 252         zfs_replay_fuid_ugid(fuid_infop, uid, gid);
 253 
 254         *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt);
 255         return (fuid_infop);
 256 }
 257 
 258 static void
 259 zfs_replay_swap_attrs(lr_attr_t *lrattr)
 260 {
 261         /* swap the lr_attr structure */
 262         byteswap_uint32_array(lrattr, sizeof (*lrattr));
 263         /* swap the bitmap */
 264         byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) *
 265             sizeof (uint32_t));
 266         /* swap the attributes, create time + 64 bit word for attributes */
 267         byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) *
 268             (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t));
 269 }
 270 
 271 /*
 272  * Replay file create with optional ACL, xvattr information as well
 273  * as option FUID information.
 274  */
 275 static int
 276 zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap)
 277 {
 278         zfsvfs_t *zfsvfs = arg1;
 279         lr_acl_create_t *lracl = arg2;
 280         char *name = NULL;              /* location determined later */
 281         lr_create_t *lr = (lr_create_t *)lracl;
 282         znode_t *dzp;
 283         vnode_t *vp = NULL;
 284         xvattr_t xva;
 285         int vflg = 0;
 286         vsecattr_t vsec = { 0 };
 287         lr_attr_t *lrattr;
 288         void *aclstart;
 289         void *fuidstart;
 290         size_t xvatlen = 0;
 291         uint64_t txtype;
 292         uint64_t objid;
 293         uint64_t dnodesize;
 294         int error;
 295 
 296         txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
 297         if (byteswap) {
 298                 byteswap_uint64_array(lracl, sizeof (*lracl));
 299                 if (txtype == TX_CREATE_ACL_ATTR ||
 300                     txtype == TX_MKDIR_ACL_ATTR) {
 301                         lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
 302                         zfs_replay_swap_attrs(lrattr);
 303                         xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 304                 }
 305 
 306                 aclstart = (caddr_t)(lracl + 1) + xvatlen;
 307                 zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE);
 308                 /* swap fuids */
 309                 if (lracl->lr_fuidcnt) {
 310                         byteswap_uint64_array((caddr_t)aclstart +
 311                             ZIL_ACE_LENGTH(lracl->lr_acl_bytes),
 312                             lracl->lr_fuidcnt * sizeof (uint64_t));
 313                 }
 314         }
 315 
 316         if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 317                 return (error);
 318 
 319         objid = LR_FOID_GET_OBJ(lr->lr_foid);
 320         dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
 321 
 322         xva_init(&xva);
 323         zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
 324             lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
 325 
 326         /*
 327          * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
 328          * eventually end up in zfs_mknode(), which assigns the object's
 329          * creation time, generation number, and dnode size. The generic
 330          * zfs_create() has no concept of these attributes, so we smuggle
 331          * the values inside the vattr's otherwise unused va_ctime,
 332          * va_nblocks, and va_fsid fields.
 333          */
 334         ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
 335         xva.xva_vattr.va_nblocks = lr->lr_gen;
 336         xva.xva_vattr.va_fsid = dnodesize;
 337 
 338         error = dmu_object_info(zfsvfs->z_os, lr->lr_foid, NULL);
 339         if (error != ENOENT)
 340                 goto bail;
 341 
 342         if (lr->lr_common.lrc_txtype & TX_CI)
 343                 vflg |= FIGNORECASE;
 344         switch (txtype) {
 345         case TX_CREATE_ACL:
 346                 aclstart = (caddr_t)(lracl + 1);
 347                 fuidstart = (caddr_t)aclstart +
 348                     ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
 349                 zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
 350                     (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
 351                     lr->lr_uid, lr->lr_gid);
 352                 /*FALLTHROUGH*/
 353         case TX_CREATE_ACL_ATTR:
 354                 if (name == NULL) {
 355                         lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
 356                         xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 357                         xva.xva_vattr.va_mask |= AT_XVATTR;
 358                         zfs_replay_xvattr(lrattr, &xva);
 359                 }
 360                 vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
 361                 vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
 362                 vsec.vsa_aclcnt = lracl->lr_aclcnt;
 363                 vsec.vsa_aclentsz = lracl->lr_acl_bytes;
 364                 vsec.vsa_aclflags = lracl->lr_acl_flags;
 365                 if (zfsvfs->z_fuid_replay == NULL) {
 366                         fuidstart = (caddr_t)(lracl + 1) + xvatlen +
 367                             ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
 368                         zfsvfs->z_fuid_replay =
 369                             zfs_replay_fuids(fuidstart,
 370                             (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
 371                             lr->lr_uid, lr->lr_gid);
 372                 }
 373 
 374                 error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
 375                     0, 0, &vp, kcred, vflg, NULL, &vsec);
 376                 break;
 377         case TX_MKDIR_ACL:
 378                 aclstart = (caddr_t)(lracl + 1);
 379                 fuidstart = (caddr_t)aclstart +
 380                     ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
 381                 zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart,
 382                     (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
 383                     lr->lr_uid, lr->lr_gid);
 384                 /*FALLTHROUGH*/
 385         case TX_MKDIR_ACL_ATTR:
 386                 if (name == NULL) {
 387                         lrattr = (lr_attr_t *)(caddr_t)(lracl + 1);
 388                         xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 389                         zfs_replay_xvattr(lrattr, &xva);
 390                 }
 391                 vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS;
 392                 vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen;
 393                 vsec.vsa_aclcnt = lracl->lr_aclcnt;
 394                 vsec.vsa_aclentsz = lracl->lr_acl_bytes;
 395                 vsec.vsa_aclflags = lracl->lr_acl_flags;
 396                 if (zfsvfs->z_fuid_replay == NULL) {
 397                         fuidstart = (caddr_t)(lracl + 1) + xvatlen +
 398                             ZIL_ACE_LENGTH(lracl->lr_acl_bytes);
 399                         zfsvfs->z_fuid_replay =
 400                             zfs_replay_fuids(fuidstart,
 401                             (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt,
 402                             lr->lr_uid, lr->lr_gid);
 403                 }
 404                 error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
 405                     &vp, kcred, NULL, vflg, &vsec);
 406                 break;
 407         default:
 408                 error = SET_ERROR(ENOTSUP);
 409         }
 410 
 411 bail:
 412         if (error == 0 && vp != NULL)
 413                 VN_RELE(vp);
 414 
 415         VN_RELE(ZTOV(dzp));
 416 
 417         if (zfsvfs->z_fuid_replay)
 418                 zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 419         zfsvfs->z_fuid_replay = NULL;
 420 
 421         return (error);
 422 }
 423 
 424 static int
 425 zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap)
 426 {
 427         zfsvfs_t *zfsvfs = arg1;
 428         lr_create_t *lr = arg2;
 429         char *name = NULL;              /* location determined later */
 430         char *link;                     /* symlink content follows name */
 431         znode_t *dzp;
 432         vnode_t *vp = NULL;
 433         xvattr_t xva;
 434         int vflg = 0;
 435         size_t lrsize = sizeof (lr_create_t);
 436         lr_attr_t *lrattr;
 437         void *start;
 438         size_t xvatlen;
 439         uint64_t txtype;
 440         int error;
 441 
 442         txtype = (lr->lr_common.lrc_txtype & ~TX_CI);
 443         if (byteswap) {
 444                 byteswap_uint64_array(lr, sizeof (*lr));
 445                 if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR)
 446                         zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
 447         }
 448 
 449 
 450         if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 451                 return (error);
 452 
 453         uint64_t objid = LR_FOID_GET_OBJ(lr->lr_foid);
 454         int dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT;
 455 
 456         xva_init(&xva);
 457         zfs_init_vattr(&xva.xva_vattr, AT_TYPE | AT_MODE | AT_UID | AT_GID,
 458             lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid);
 459 
 460         /*
 461          * All forms of zfs create (create, mkdir, mkxattrdir, symlink)
 462          * eventually end up in zfs_mknode(), which assigns the object's
 463          * creation time, generation number, and dnode slot count. The
 464          * generic zfs_create() has no concept of these attributes, so
 465          * we smuggle the values inside the vattr's otherwise unused
 466          * va_ctime, va_nblocks and va_fsid fields.
 467          */
 468         ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime);
 469         xva.xva_vattr.va_nblocks = lr->lr_gen;
 470         xva.xva_vattr.va_fsid = dnodesize;
 471 
 472         error = dmu_object_info(zfsvfs->z_os, objid, NULL);
 473         if (error != ENOENT)
 474                 goto out;
 475 
 476         if (lr->lr_common.lrc_txtype & TX_CI)
 477                 vflg |= FIGNORECASE;
 478 
 479         /*
 480          * Symlinks don't have fuid info, and CIFS never creates
 481          * symlinks.
 482          *
 483          * The _ATTR versions will grab the fuid info in their subcases.
 484          */
 485         if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK &&
 486             (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR &&
 487             (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) {
 488                 start = (lr + 1);
 489                 zfsvfs->z_fuid_replay =
 490                     zfs_replay_fuid_domain(start, &start,
 491                     lr->lr_uid, lr->lr_gid);
 492         }
 493 
 494         switch (txtype) {
 495         case TX_CREATE_ATTR:
 496                 lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
 497                 xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 498                 zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
 499                 start = (caddr_t)(lr + 1) + xvatlen;
 500                 zfsvfs->z_fuid_replay =
 501                     zfs_replay_fuid_domain(start, &start,
 502                     lr->lr_uid, lr->lr_gid);
 503                 name = (char *)start;
 504 
 505                 /*FALLTHROUGH*/
 506         case TX_CREATE:
 507                 if (name == NULL)
 508                         name = (char *)start;
 509 
 510                 error = VOP_CREATE(ZTOV(dzp), name, &xva.xva_vattr,
 511                     0, 0, &vp, kcred, vflg, NULL, NULL);
 512                 break;
 513         case TX_MKDIR_ATTR:
 514                 lrattr = (lr_attr_t *)(caddr_t)(lr + 1);
 515                 xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize);
 516                 zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva);
 517                 start = (caddr_t)(lr + 1) + xvatlen;
 518                 zfsvfs->z_fuid_replay =
 519                     zfs_replay_fuid_domain(start, &start,
 520                     lr->lr_uid, lr->lr_gid);
 521                 name = (char *)start;
 522 
 523                 /*FALLTHROUGH*/
 524         case TX_MKDIR:
 525                 if (name == NULL)
 526                         name = (char *)(lr + 1);
 527 
 528                 error = VOP_MKDIR(ZTOV(dzp), name, &xva.xva_vattr,
 529                     &vp, kcred, NULL, vflg, NULL);
 530                 break;
 531         case TX_MKXATTR:
 532                 error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred);
 533                 break;
 534         case TX_SYMLINK:
 535                 name = (char *)(lr + 1);
 536                 link = name + strlen(name) + 1;
 537                 error = VOP_SYMLINK(ZTOV(dzp), name, &xva.xva_vattr,
 538                     link, kcred, NULL, vflg);
 539                 break;
 540         default:
 541                 error = SET_ERROR(ENOTSUP);
 542         }
 543 
 544 out:
 545         if (error == 0 && vp != NULL)
 546                 VN_RELE(vp);
 547 
 548         VN_RELE(ZTOV(dzp));
 549 
 550         if (zfsvfs->z_fuid_replay)
 551                 zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 552         zfsvfs->z_fuid_replay = NULL;
 553         return (error);
 554 }
 555 
 556 static int
 557 zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap)
 558 {
 559         zfsvfs_t *zfsvfs = arg1;
 560         lr_remove_t *lr = arg2;
 561         char *name = (char *)(lr + 1);  /* name follows lr_remove_t */
 562         znode_t *dzp;
 563         int error;
 564         int vflg = 0;
 565 
 566         if (byteswap)
 567                 byteswap_uint64_array(lr, sizeof (*lr));
 568 
 569         if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 570                 return (error);
 571 
 572         if (lr->lr_common.lrc_txtype & TX_CI)
 573                 vflg |= FIGNORECASE;
 574 
 575         switch ((int)lr->lr_common.lrc_txtype) {
 576         case TX_REMOVE:
 577                 error = VOP_REMOVE(ZTOV(dzp), name, kcred, NULL, vflg);
 578                 break;
 579         case TX_RMDIR:
 580                 error = VOP_RMDIR(ZTOV(dzp), name, NULL, kcred, NULL, vflg);
 581                 break;
 582         default:
 583                 error = SET_ERROR(ENOTSUP);
 584         }
 585 
 586         VN_RELE(ZTOV(dzp));
 587 
 588         return (error);
 589 }
 590 
 591 static int
 592 zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap)
 593 {
 594         zfsvfs_t *zfsvfs = arg1;
 595         lr_link_t *lr = arg2;
 596         char *name = (char *)(lr + 1);  /* name follows lr_link_t */
 597         znode_t *dzp, *zp;
 598         int error;
 599         int vflg = 0;
 600 
 601         if (byteswap)
 602                 byteswap_uint64_array(lr, sizeof (*lr));
 603 
 604         if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0)
 605                 return (error);
 606 
 607         if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) {
 608                 VN_RELE(ZTOV(dzp));
 609                 return (error);
 610         }
 611 
 612         if (lr->lr_common.lrc_txtype & TX_CI)
 613                 vflg |= FIGNORECASE;
 614 
 615         error = VOP_LINK(ZTOV(dzp), ZTOV(zp), name, kcred, NULL, vflg);
 616 
 617         VN_RELE(ZTOV(zp));
 618         VN_RELE(ZTOV(dzp));
 619 
 620         return (error);
 621 }
 622 
 623 static int
 624 zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap)
 625 {
 626         zfsvfs_t *zfsvfs = arg1;
 627         lr_rename_t *lr = arg2;
 628         char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */
 629         char *tname = sname + strlen(sname) + 1;
 630         znode_t *sdzp, *tdzp;
 631         int error;
 632         int vflg = 0;
 633 
 634         if (byteswap)
 635                 byteswap_uint64_array(lr, sizeof (*lr));
 636 
 637         if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0)
 638                 return (error);
 639 
 640         if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) {
 641                 VN_RELE(ZTOV(sdzp));
 642                 return (error);
 643         }
 644 
 645         if (lr->lr_common.lrc_txtype & TX_CI)
 646                 vflg |= FIGNORECASE;
 647 
 648         error = VOP_RENAME(ZTOV(sdzp), sname, ZTOV(tdzp), tname, kcred,
 649             NULL, vflg);
 650 
 651         VN_RELE(ZTOV(tdzp));
 652         VN_RELE(ZTOV(sdzp));
 653 
 654         return (error);
 655 }
 656 
 657 static int
 658 zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap)
 659 {
 660         zfsvfs_t *zfsvfs = arg1;
 661         lr_write_t *lr = arg2;
 662         char *data = (char *)(lr + 1);  /* data follows lr_write_t */
 663         znode_t *zp;
 664         int error;
 665         ssize_t resid;
 666         uint64_t eod, offset, length;
 667 
 668         if (byteswap)
 669                 byteswap_uint64_array(lr, sizeof (*lr));
 670 
 671         if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) {
 672                 /*
 673                  * As we can log writes out of order, it's possible the
 674                  * file has been removed. In this case just drop the write
 675                  * and return success.
 676                  */
 677                 if (error == ENOENT)
 678                         error = 0;
 679                 return (error);
 680         }
 681 
 682         offset = lr->lr_offset;
 683         length = lr->lr_length;
 684         eod = offset + length;  /* end of data for this write */
 685 
 686         /*
 687          * This may be a write from a dmu_sync() for a whole block,
 688          * and may extend beyond the current end of the file.
 689          * We can't just replay what was written for this TX_WRITE as
 690          * a future TX_WRITE2 may extend the eof and the data for that
 691          * write needs to be there. So we write the whole block and
 692          * reduce the eof. This needs to be done within the single dmu
 693          * transaction created within vn_rdwr -> zfs_write. So a possible
 694          * new end of file is passed through in zfsvfs->z_replay_eof
 695          */
 696 
 697         zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */
 698 
 699         /* If it's a dmu_sync() block, write the whole block */
 700         if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) {
 701                 uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr);
 702                 if (length < blocksize) {
 703                         offset -= offset % blocksize;
 704                         length = blocksize;
 705                 }
 706                 if (zp->z_size < eod)
 707                         zfsvfs->z_replay_eof = eod;
 708         }
 709 
 710         error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset,
 711             UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid);
 712 
 713         VN_RELE(ZTOV(zp));
 714         zfsvfs->z_replay_eof = 0;    /* safety */
 715 
 716         return (error);
 717 }
 718 
 719 /*
 720  * TX_WRITE2 are only generated when dmu_sync() returns EALREADY
 721  * meaning the pool block is already being synced. So now that we always write
 722  * out full blocks, all we have to do is expand the eof if
 723  * the file is grown.
 724  */
 725 static int
 726 zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap)
 727 {
 728         zfsvfs_t *zfsvfs = arg1;
 729         lr_write_t *lr = arg2;
 730         znode_t *zp;
 731         int error;
 732         uint64_t end;
 733 
 734         if (byteswap)
 735                 byteswap_uint64_array(lr, sizeof (*lr));
 736 
 737         if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 738                 return (error);
 739 
 740 top:
 741         end = lr->lr_offset + lr->lr_length;
 742         if (end > zp->z_size) {
 743                 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
 744 
 745                 zp->z_size = end;
 746                 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 747                 error = dmu_tx_assign(tx, TXG_WAIT);
 748                 if (error) {
 749                         VN_RELE(ZTOV(zp));
 750                         if (error == ERESTART) {
 751                                 dmu_tx_wait(tx);
 752                                 dmu_tx_abort(tx);
 753                                 goto top;
 754                         }
 755                         dmu_tx_abort(tx);
 756                         return (error);
 757                 }
 758                 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 759                     (void *)&zp->z_size, sizeof (uint64_t), tx);
 760 
 761                 /* Ensure the replayed seq is updated */
 762                 (void) zil_replaying(zfsvfs->z_log, tx);
 763 
 764                 dmu_tx_commit(tx);
 765         }
 766 
 767         VN_RELE(ZTOV(zp));
 768 
 769         return (error);
 770 }
 771 
 772 static int
 773 zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap)
 774 {
 775         zfsvfs_t *zfsvfs = arg1;
 776         lr_truncate_t *lr = arg2;
 777         znode_t *zp;
 778         flock64_t fl;
 779         int error;
 780 
 781         if (byteswap)
 782                 byteswap_uint64_array(lr, sizeof (*lr));
 783 
 784         if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 785                 return (error);
 786 
 787         bzero(&fl, sizeof (fl));
 788         fl.l_type = F_WRLCK;
 789         fl.l_whence = 0;
 790         fl.l_start = lr->lr_offset;
 791         fl.l_len = lr->lr_length;
 792 
 793         error = VOP_SPACE(ZTOV(zp), F_FREESP, &fl, FWRITE | FOFFMAX,
 794             lr->lr_offset, kcred, NULL);
 795 
 796         VN_RELE(ZTOV(zp));
 797 
 798         return (error);
 799 }
 800 
 801 static int
 802 zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap)
 803 {
 804         zfsvfs_t *zfsvfs = arg1;
 805         lr_setattr_t *lr = arg2;
 806         znode_t *zp;
 807         xvattr_t xva;
 808         vattr_t *vap = &xva.xva_vattr;
 809         int error;
 810         void *start;
 811 
 812         xva_init(&xva);
 813         if (byteswap) {
 814                 byteswap_uint64_array(lr, sizeof (*lr));
 815 
 816                 if ((lr->lr_mask & AT_XVATTR) &&
 817                     zfsvfs->z_version >= ZPL_VERSION_INITIAL)
 818                         zfs_replay_swap_attrs((lr_attr_t *)(lr + 1));
 819         }
 820 
 821         if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 822                 return (error);
 823 
 824         zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode,
 825             lr->lr_uid, lr->lr_gid, 0, lr->lr_foid);
 826 
 827         vap->va_size = lr->lr_size;
 828         ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime);
 829         ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime);
 830 
 831         /*
 832          * Fill in xvattr_t portions if necessary.
 833          */
 834 
 835         start = (lr_setattr_t *)(lr + 1);
 836         if (vap->va_mask & AT_XVATTR) {
 837                 zfs_replay_xvattr((lr_attr_t *)start, &xva);
 838                 start = (caddr_t)start +
 839                     ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize);
 840         } else
 841                 xva.xva_vattr.va_mask &= ~AT_XVATTR;
 842 
 843         zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start,
 844             lr->lr_uid, lr->lr_gid);
 845 
 846         error = VOP_SETATTR(ZTOV(zp), vap, 0, kcred, NULL);
 847 
 848         zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 849         zfsvfs->z_fuid_replay = NULL;
 850         VN_RELE(ZTOV(zp));
 851 
 852         return (error);
 853 }
 854 
 855 static int
 856 zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap)
 857 {
 858         zfsvfs_t *zfsvfs = arg1;
 859         lr_acl_v0_t *lr = arg2;
 860         ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */
 861         vsecattr_t vsa;
 862         znode_t *zp;
 863         int error;
 864 
 865         if (byteswap) {
 866                 byteswap_uint64_array(lr, sizeof (*lr));
 867                 zfs_oldace_byteswap(ace, lr->lr_aclcnt);
 868         }
 869 
 870         if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 871                 return (error);
 872 
 873         bzero(&vsa, sizeof (vsa));
 874         vsa.vsa_mask = VSA_ACE | VSA_ACECNT;
 875         vsa.vsa_aclcnt = lr->lr_aclcnt;
 876         vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt;
 877         vsa.vsa_aclflags = 0;
 878         vsa.vsa_aclentp = ace;
 879 
 880         error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
 881 
 882         VN_RELE(ZTOV(zp));
 883 
 884         return (error);
 885 }
 886 
 887 /*
 888  * Replaying ACLs is complicated by FUID support.
 889  * The log record may contain some optional data
 890  * to be used for replaying FUID's.  These pieces
 891  * are the actual FUIDs that were created initially.
 892  * The FUID table index may no longer be valid and
 893  * during zfs_create() a new index may be assigned.
 894  * Because of this the log will contain the original
 895  * doman+rid in order to create a new FUID.
 896  *
 897  * The individual ACEs may contain an ephemeral uid/gid which is no
 898  * longer valid and will need to be replaced with an actual FUID.
 899  *
 900  */
 901 static int
 902 zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap)
 903 {
 904         zfsvfs_t *zfsvfs = arg1;
 905         lr_acl_t *lr = arg2;
 906         ace_t *ace = (ace_t *)(lr + 1);
 907         vsecattr_t vsa;
 908         znode_t *zp;
 909         int error;
 910 
 911         if (byteswap) {
 912                 byteswap_uint64_array(lr, sizeof (*lr));
 913                 zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE);
 914                 if (lr->lr_fuidcnt) {
 915                         byteswap_uint64_array((caddr_t)ace +
 916                             ZIL_ACE_LENGTH(lr->lr_acl_bytes),
 917                             lr->lr_fuidcnt * sizeof (uint64_t));
 918                 }
 919         }
 920 
 921         if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0)
 922                 return (error);
 923 
 924         bzero(&vsa, sizeof (vsa));
 925         vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS;
 926         vsa.vsa_aclcnt = lr->lr_aclcnt;
 927         vsa.vsa_aclentp = ace;
 928         vsa.vsa_aclentsz = lr->lr_acl_bytes;
 929         vsa.vsa_aclflags = lr->lr_acl_flags;
 930 
 931         if (lr->lr_fuidcnt) {
 932                 void *fuidstart = (caddr_t)ace +
 933                     ZIL_ACE_LENGTH(lr->lr_acl_bytes);
 934 
 935                 zfsvfs->z_fuid_replay =
 936                     zfs_replay_fuids(fuidstart, &fuidstart,
 937                     lr->lr_fuidcnt, lr->lr_domcnt, 0, 0);
 938         }
 939 
 940         error = VOP_SETSECATTR(ZTOV(zp), &vsa, 0, kcred, NULL);
 941 
 942         if (zfsvfs->z_fuid_replay)
 943                 zfs_fuid_info_free(zfsvfs->z_fuid_replay);
 944 
 945         zfsvfs->z_fuid_replay = NULL;
 946         VN_RELE(ZTOV(zp));
 947 
 948         return (error);
 949 }
 950 
/*
 * Callback vector for replaying records.
 *
 * The table is indexed by transaction type, so the initializers are
 * positional: each entry must stay in the slot matching the TX_* type
 * named in its trailing comment.  Slot 0 is not a valid transaction type
 * and is routed to zfs_replay_error.  Several types share one handler
 * (for example all the plain create variants use zfs_replay_create).
 */
zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = {
        zfs_replay_error,       /* 0 no such transaction type */
        zfs_replay_create,      /* TX_CREATE */
        zfs_replay_create,      /* TX_MKDIR */
        zfs_replay_create,      /* TX_MKXATTR */
        zfs_replay_create,      /* TX_SYMLINK */
        zfs_replay_remove,      /* TX_REMOVE */
        zfs_replay_remove,      /* TX_RMDIR */
        zfs_replay_link,        /* TX_LINK */
        zfs_replay_rename,      /* TX_RENAME */
        zfs_replay_write,       /* TX_WRITE */
        zfs_replay_truncate,    /* TX_TRUNCATE */
        zfs_replay_setattr,     /* TX_SETATTR */
        zfs_replay_acl_v0,      /* TX_ACL_V0 */
        zfs_replay_acl,         /* TX_ACL */
        zfs_replay_create_acl,  /* TX_CREATE_ACL */
        zfs_replay_create,      /* TX_CREATE_ATTR */
        zfs_replay_create_acl,  /* TX_CREATE_ACL_ATTR */
        zfs_replay_create_acl,  /* TX_MKDIR_ACL */
        zfs_replay_create,      /* TX_MKDIR_ATTR */
        zfs_replay_create_acl,  /* TX_MKDIR_ACL_ATTR */
        zfs_replay_write2,      /* TX_WRITE2 */
};