OS-4319 zfs mishandles partial writes
--- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 25 * Copyright (c) 2014 Integros [integros.com]
26 26 * Copyright 2015 Joyent, Inc.
27 27 * Copyright 2017 Nexenta Systems, Inc.
28 28 */
29 29
30 30 /* Portions Copyright 2007 Jeremy Teo */
31 31 /* Portions Copyright 2010 Robert Milkowski */
32 32
33 33 #include <sys/types.h>
34 34 #include <sys/param.h>
35 35 #include <sys/time.h>
36 36 #include <sys/systm.h>
37 37 #include <sys/sysmacros.h>
38 38 #include <sys/resource.h>
39 39 #include <sys/vfs.h>
40 40 #include <sys/vfs_opreg.h>
41 41 #include <sys/vnode.h>
42 42 #include <sys/file.h>
43 43 #include <sys/stat.h>
44 44 #include <sys/kmem.h>
45 45 #include <sys/taskq.h>
46 46 #include <sys/uio.h>
47 47 #include <sys/vmsystm.h>
48 48 #include <sys/atomic.h>
49 49 #include <sys/vm.h>
50 50 #include <vm/seg_vn.h>
51 51 #include <vm/pvn.h>
52 52 #include <vm/as.h>
53 53 #include <vm/kpm.h>
54 54 #include <vm/seg_kpm.h>
55 55 #include <sys/mman.h>
56 56 #include <sys/pathname.h>
57 57 #include <sys/cmn_err.h>
58 58 #include <sys/errno.h>
59 59 #include <sys/unistd.h>
60 60 #include <sys/zfs_dir.h>
61 61 #include <sys/zfs_acl.h>
62 62 #include <sys/zfs_ioctl.h>
63 63 #include <sys/fs/zfs.h>
64 64 #include <sys/dmu.h>
65 65 #include <sys/dmu_objset.h>
66 66 #include <sys/spa.h>
67 67 #include <sys/txg.h>
68 68 #include <sys/dbuf.h>
69 69 #include <sys/zap.h>
70 70 #include <sys/sa.h>
71 71 #include <sys/dirent.h>
72 72 #include <sys/policy.h>
73 73 #include <sys/sunddi.h>
74 74 #include <sys/filio.h>
75 75 #include <sys/sid.h>
76 76 #include "fs/fs_subr.h"
77 77 #include <sys/zfs_ctldir.h>
78 78 #include <sys/zfs_fuid.h>
79 79 #include <sys/zfs_sa.h>
80 80 #include <sys/dnlc.h>
81 81 #include <sys/zfs_rlock.h>
82 82 #include <sys/extdirent.h>
83 83 #include <sys/kidmap.h>
84 84 #include <sys/cred.h>
85 85 #include <sys/attr.h>
86 86 #include <sys/zil.h>
87 87
88 88 /*
89 89 * Programming rules.
90 90 *
91 91 * Each vnode op performs some logical unit of work. To do this, the ZPL must
92 92 * properly lock its in-core state, create a DMU transaction, do the work,
93 93 * record this work in the intent log (ZIL), commit the DMU transaction,
94 94 * and wait for the intent log to commit if it is a synchronous operation.
95 95 * Moreover, the vnode ops must work in both normal and log replay context.
96 96 * The ordering of events is important to avoid deadlocks and references
97 97 * to freed memory. The example below illustrates the following Big Rules:
98 98 *
99 99 * (1) A check must be made in each zfs thread for a mounted file system.
100 100  *      This is done while avoiding races, using ZFS_ENTER(zfsvfs).
101 101 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
102 102 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
103 103 * can return EIO from the calling function.
104 104 *
105 105 * (2) VN_RELE() should always be the last thing except for zil_commit()
106 106 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
107 107 * First, if it's the last reference, the vnode/znode
108 108 * can be freed, so the zp may point to freed memory. Second, the last
109 109 * reference will call zfs_zinactive(), which may induce a lot of work --
110 110 * pushing cached pages (which acquires range locks) and syncing out
111 111 * cached atime changes. Third, zfs_zinactive() may require a new tx,
112 112 * which could deadlock the system if you were already holding one.
113 113 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
114 114 *
115 115 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
116 116 * as they can span dmu_tx_assign() calls.
117 117 *
118 118 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
119 119 * dmu_tx_assign(). This is critical because we don't want to block
120 120 * while holding locks.
121 121 *
122 122 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
123 123 * reduces lock contention and CPU usage when we must wait (note that if
124 124 * throughput is constrained by the storage, nearly every transaction
125 125 * must wait).
126 126 *
127 127 * Note, in particular, that if a lock is sometimes acquired before
128 128 * the tx assigns, and sometimes after (e.g. z_lock), then failing
129 129 * to use a non-blocking assign can deadlock the system. The scenario:
130 130 *
131 131 * Thread A has grabbed a lock before calling dmu_tx_assign().
132 132 * Thread B is in an already-assigned tx, and blocks for this lock.
133 133 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
134 134 * forever, because the previous txg can't quiesce until B's tx commits.
135 135 *
136 136 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
137 137 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
138 138 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
139 139 * to indicate that this operation has already called dmu_tx_wait().
140 140 * This will ensure that we don't retry forever, waiting a short bit
141 141 * each time.
142 142 *
143 143 * (5) If the operation succeeded, generate the intent log entry for it
144 144 * before dropping locks. This ensures that the ordering of events
145 145 * in the intent log matches the order in which they actually occurred.
146 146 * During ZIL replay the zfs_log_* functions will update the sequence
147 147 * number to indicate the zil transaction has replayed.
148 148 *
149 149 * (6) At the end of each vnode op, the DMU tx must always commit,
150 150 * regardless of whether there were any errors.
151 151 *
152 152 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
153 153 * to ensure that synchronous semantics are provided when necessary.
154 154 *
155 155 * In general, this is how things should be ordered in each vnode op:
156 156 *
157 157 * ZFS_ENTER(zfsvfs); // exit if unmounted
158 158 * top:
159 159 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
160 160 * rw_enter(...); // grab any other locks you need
161 161 * tx = dmu_tx_create(...); // get DMU tx
162 162 * dmu_tx_hold_*(); // hold each object you might modify
163 163 * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
164 164 * if (error) {
165 165 * rw_exit(...); // drop locks
166 166 * zfs_dirent_unlock(dl); // unlock directory entry
167 167 * VN_RELE(...); // release held vnodes
168 168 * if (error == ERESTART) {
169 169 * waited = B_TRUE;
170 170 * dmu_tx_wait(tx);
171 171 * dmu_tx_abort(tx);
172 172 * goto top;
173 173 * }
174 174 * dmu_tx_abort(tx); // abort DMU tx
175 175 * ZFS_EXIT(zfsvfs); // finished in zfs
176 176 * return (error); // really out of space
177 177 * }
178 178 * error = do_real_work(); // do whatever this VOP does
179 179 * if (error == 0)
180 180 * zfs_log_*(...); // on success, make ZIL entry
181 181 * dmu_tx_commit(tx); // commit DMU tx -- error or not
182 182 * rw_exit(...); // drop locks
183 183 * zfs_dirent_unlock(dl); // unlock directory entry
184 184 * VN_RELE(...); // release held vnodes
185 185 * zil_commit(zilog, foid); // synchronous when necessary
186 186 * ZFS_EXIT(zfsvfs); // finished in zfs
187 187 * return (error); // done, report error
188 188 */
189 189
190 190 /* ARGSUSED */
191 191 static int
192 192 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
193 193 {
194 194 znode_t *zp = VTOZ(*vpp);
195 195 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
196 196
197 197 ZFS_ENTER(zfsvfs);
198 198 ZFS_VERIFY_ZP(zp);
199 199
200 200 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
201 201 ((flag & FAPPEND) == 0)) {
202 202 ZFS_EXIT(zfsvfs);
203 203 return (SET_ERROR(EPERM));
204 204 }
205 205
206 206 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
207 207 ZTOV(zp)->v_type == VREG &&
208 208 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
209 209 if (fs_vscan(*vpp, cr, 0) != 0) {
210 210 ZFS_EXIT(zfsvfs);
211 211 return (SET_ERROR(EACCES));
212 212 }
213 213 }
214 214
215 215 /* Keep a count of the synchronous opens in the znode */
216 216 if (flag & (FSYNC | FDSYNC))
217 217 atomic_inc_32(&zp->z_sync_cnt);
218 218
219 219 ZFS_EXIT(zfsvfs);
220 220 return (0);
221 221 }
222 222
223 223 /* ARGSUSED */
224 224 static int
225 225 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
226 226 caller_context_t *ct)
227 227 {
228 228 znode_t *zp = VTOZ(vp);
229 229 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
230 230
231 231 /*
232 232 * Clean up any locks held by this process on the vp.
233 233 */
234 234 cleanlocks(vp, ddi_get_pid(), 0);
235 235 cleanshares(vp, ddi_get_pid());
236 236
237 237 ZFS_ENTER(zfsvfs);
238 238 ZFS_VERIFY_ZP(zp);
239 239
240 240 /* Decrement the synchronous opens in the znode */
241 241 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
242 242 atomic_dec_32(&zp->z_sync_cnt);
243 243
244 244 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
245 245 ZTOV(zp)->v_type == VREG &&
246 246 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
247 247 VERIFY(fs_vscan(vp, cr, 1) == 0);
248 248
249 249 ZFS_EXIT(zfsvfs);
250 250 return (0);
251 251 }
252 252
253 253 /*
254 254 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
255 255 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
256 256 */
257 257 static int
258 258 zfs_holey(vnode_t *vp, int cmd, offset_t *off)
259 259 {
260 260 znode_t *zp = VTOZ(vp);
261 261 uint64_t noff = (uint64_t)*off; /* new offset */
262 262 uint64_t file_sz;
263 263 int error;
264 264 boolean_t hole;
265 265
266 266 file_sz = zp->z_size;
267 267 if (noff >= file_sz) {
268 268 return (SET_ERROR(ENXIO));
269 269 }
270 270
271 271 if (cmd == _FIO_SEEK_HOLE)
272 272 hole = B_TRUE;
273 273 else
274 274 hole = B_FALSE;
275 275
276 276 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
277 277
278 278 if (error == ESRCH)
279 279 return (SET_ERROR(ENXIO));
280 280
281 281 /*
282 282 * We could find a hole that begins after the logical end-of-file,
283 283 * because dmu_offset_next() only works on whole blocks. If the
284 284 * EOF falls mid-block, then indicate that the "virtual hole"
285 285 * at the end of the file begins at the logical EOF, rather than
286 286 * at the end of the last block.
287 287 */
288 288 if (noff > file_sz) {
289 289 ASSERT(hole);
290 290 noff = file_sz;
291 291 }
292 292
293 293 if (noff < *off)
294 294 return (error);
295 295 *off = noff;
296 296 return (error);
297 297 }
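
zfs_holey() is the kernel half of hole/data seeking: "off" is in/out, ENXIO means there is no hole or data at or beyond the offset, and a "virtual hole" is reported at the logical EOF even when the last block is only partly filled. From userland the same machinery is reached through lseek(2) with SEEK_DATA/SEEK_HOLE. A minimal sketch, illustrative only (assumes a sparse file named "f"; error checking omitted):

    #include <sys/types.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fd = open("f", O_RDONLY);
            /* First data byte at or after offset 0. */
            off_t data = lseek(fd, 0, SEEK_DATA);
            /* First hole at or after the data; EOF counts as a hole. */
            off_t hole = lseek(fd, data, SEEK_HOLE);
            (void) printf("data at %lld, hole at %lld\n",
                (long long)data, (long long)hole);
            return (0);
    }

Both calls return -1 with errno set to ENXIO when there is nothing further to find, matching the SET_ERROR(ENXIO) paths above.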
298 298
299 299 /* ARGSUSED */
300 300 static int
301 301 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
302 302 int *rvalp, caller_context_t *ct)
303 303 {
304 304 offset_t off;
305 305 offset_t ndata;
306 306 dmu_object_info_t doi;
307 307 int error;
308 308 zfsvfs_t *zfsvfs;
309 309 znode_t *zp;
310 310
311 311 switch (com) {
312 312 case _FIOFFS:
313 313 {
314 314 return (zfs_sync(vp->v_vfsp, 0, cred));
315 315
316 316 /*
317 317 * The following two ioctls are used by bfu. Faking out,
318 318 * necessary to avoid bfu errors.
319 319 */
320 320 }
321 321 case _FIOGDIO:
322 322 case _FIOSDIO:
323 323 {
324 324 return (0);
325 325 }
326 326
327 327 case _FIO_SEEK_DATA:
328 328 case _FIO_SEEK_HOLE:
329 329 {
330 330 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
331 331 return (SET_ERROR(EFAULT));
332 332
333 333 zp = VTOZ(vp);
334 334 zfsvfs = zp->z_zfsvfs;
335 335 ZFS_ENTER(zfsvfs);
336 336 ZFS_VERIFY_ZP(zp);
337 337
338 338 /* offset parameter is in/out */
339 339 error = zfs_holey(vp, com, &off);
340 340 ZFS_EXIT(zfsvfs);
341 341 if (error)
342 342 return (error);
343 343 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
344 344 return (SET_ERROR(EFAULT));
345 345 return (0);
346 346 }
347 347 case _FIO_COUNT_FILLED:
348 348 {
349 349 /*
350 350 * _FIO_COUNT_FILLED adds a new ioctl command which
351 351 * exposes the number of filled blocks in a
352 352 * ZFS object.
353 353 */
354 354 zp = VTOZ(vp);
355 355 zfsvfs = zp->z_zfsvfs;
356 356 ZFS_ENTER(zfsvfs);
357 357 ZFS_VERIFY_ZP(zp);
358 358
359 359 /*
360 360 * Wait for all dirty blocks for this object
361 361 * to get synced out to disk, and the DMU info
362 362 * updated.
363 363 */
364 364 error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
365 365 if (error) {
366 366 ZFS_EXIT(zfsvfs);
367 367 return (error);
368 368 }
369 369
370 370 /*
371 371 * Retrieve fill count from DMU object.
372 372 */
373 373 error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
374 374 if (error) {
375 375 ZFS_EXIT(zfsvfs);
376 376 return (error);
377 377 }
378 378
379 379 ndata = doi.doi_fill_count;
380 380
381 381 ZFS_EXIT(zfsvfs);
382 382 if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
383 383 return (SET_ERROR(EFAULT));
384 384 return (0);
385 385 }
386 386 }
387 387 return (SET_ERROR(ENOTTY));
388 388 }
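
The _FIO_COUNT_FILLED path first forces dirty blocks out with dmu_object_wait_synced() so that doi_fill_count is current, then copies the count back out as an offset_t. A hedged userland sketch (the ioctl and <sys/filio.h> are real; the file name, headers beyond that, and the lack of error detail are illustrative):

    #include <sys/types.h>
    #include <sys/filio.h>
    #include <fcntl.h>
    #include <stropts.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fd = open("f", O_RDONLY);
            offset_t filled;

            /* Number of filled (non-hole) blocks in the ZFS object. */
            if (ioctl(fd, _FIO_COUNT_FILLED, &filled) != 0)
                    return (1);
            return (0);
    }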
389 389
390 390 /*
391 391 * Utility functions to map and unmap a single physical page. These
392 392 * are used to manage the mappable copies of ZFS file data, and therefore
393 393 * do not update ref/mod bits.
394 394 */
395 395 caddr_t
396 396 zfs_map_page(page_t *pp, enum seg_rw rw)
397 397 {
398 398 if (kpm_enable)
399 399 return (hat_kpm_mapin(pp, 0));
400 400 ASSERT(rw == S_READ || rw == S_WRITE);
401 401 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
402 402 (caddr_t)-1));
403 403 }
404 404
405 405 void
406 406 zfs_unmap_page(page_t *pp, caddr_t addr)
407 407 {
408 408 if (kpm_enable) {
409 409 hat_kpm_mapout(pp, 0, addr);
410 410 } else {
411 411 ppmapout(addr);
412 412 }
413 413 }
414 414
415 415 /*
416 416 * When a file is memory mapped, we must keep the IO data synchronized
417 417 * between the DMU cache and the memory mapped pages. What this means:
418 418 *
419 419 * On Write: If we find a memory mapped page, we write to *both*
420 420 * the page and the dmu buffer.
421 421 */
422 422 static void
423 423 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
424 424 {
425 425 int64_t off;
426 426
427 427 off = start & PAGEOFFSET;
428 428 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
429 429 page_t *pp;
430 430 uint64_t nbytes = MIN(PAGESIZE - off, len);
431 431
432 432 if (pp = page_lookup(vp, start, SE_SHARED)) {
433 433 caddr_t va;
434 434
435 435 va = zfs_map_page(pp, S_WRITE);
436 436 (void) dmu_read(os, oid, start+off, nbytes, va+off,
437 437 DMU_READ_PREFETCH);
438 438 zfs_unmap_page(pp, va);
439 439 page_unlock(pp);
440 440 }
441 441 len -= nbytes;
442 442 off = 0;
443 443 }
444 444 }
445 445
446 446 /*
447 447 * When a file is memory mapped, we must keep the IO data synchronized
448 448 * between the DMU cache and the memory mapped pages. What this means:
449 449 *
450 450 * On Read: We "read" preferentially from memory mapped pages,
451 451 * else we default from the dmu buffer.
452 452 *
453 453 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
454 454 * the file is memory mapped.
455 455 */
456 456 static int
457 457 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
458 458 {
459 459 znode_t *zp = VTOZ(vp);
460 460 int64_t start, off;
461 461 int len = nbytes;
462 462 int error = 0;
463 463
464 464 start = uio->uio_loffset;
465 465 off = start & PAGEOFFSET;
466 466 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
467 467 page_t *pp;
468 468 uint64_t bytes = MIN(PAGESIZE - off, len);
469 469
470 470 if (pp = page_lookup(vp, start, SE_SHARED)) {
471 471 caddr_t va;
472 472
473 473 va = zfs_map_page(pp, S_READ);
474 474 error = uiomove(va + off, bytes, UIO_READ, uio);
475 475 zfs_unmap_page(pp, va);
476 476 page_unlock(pp);
477 477 } else {
478 478 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
479 479 uio, bytes);
480 480 }
481 481 len -= bytes;
482 482 off = 0;
483 483 if (error)
484 484 break;
485 485 }
486 486 return (error);
487 487 }
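
update_pages() and mappedread() together keep the two copies of file data coherent: writes land in both the DMU buffer and any cached page, and reads prefer a cached page when one exists. A small illustrative sketch of the guarantee this buys userland (assumes "f" exists and is at least a page long; names and sizes are arbitrary):

    #include <sys/mman.h>
    #include <assert.h>
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fd = open("f", O_RDWR);
            char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

            /*
             * write(2) goes through zfs_write(), which calls
             * update_pages() because the file has cached pages...
             */
            (void) pwrite(fd, "new", 3, 0);
            /* ...so the existing mapping observes the new bytes. */
            assert(memcmp(p, "new", 3) == 0);
            return (0);
    }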
488 488
489 489 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
490 490
491 491 /*
492 492 * Read bytes from specified file into supplied buffer.
493 493 *
494 494 * IN: vp - vnode of file to be read from.
495 495 * uio - structure supplying read location, range info,
496 496 * and return buffer.
497 497 * ioflag - SYNC flags; used to provide FRSYNC semantics.
498 498 * cr - credentials of caller.
499 499 * ct - caller context
500 500 *
501 501 * OUT: uio - updated offset and range, buffer filled.
502 502 *
503 503 * RETURN: 0 on success, error code on failure.
504 504 *
505 505 * Side Effects:
506 506 * vp - atime updated if byte count > 0
507 507 */
508 508 /* ARGSUSED */
509 509 static int
510 510 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
511 511 {
512 512 znode_t *zp = VTOZ(vp);
513 513 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
514 514 ssize_t n, nbytes;
515 515 int error = 0;
516 516 xuio_t *xuio = NULL;
517 517
518 518 ZFS_ENTER(zfsvfs);
519 519 ZFS_VERIFY_ZP(zp);
520 520
521 521 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
522 522 ZFS_EXIT(zfsvfs);
523 523 return (SET_ERROR(EACCES));
524 524 }
525 525
526 526 /*
527 527 * Validate file offset
528 528 */
529 529 if (uio->uio_loffset < (offset_t)0) {
530 530 ZFS_EXIT(zfsvfs);
531 531 return (SET_ERROR(EINVAL));
532 532 }
533 533
534 534 /*
535 535 * Fasttrack empty reads
536 536 */
537 537 if (uio->uio_resid == 0) {
538 538 ZFS_EXIT(zfsvfs);
539 539 return (0);
540 540 }
541 541
542 542 /*
543 543 * Check for mandatory locks
544 544 */
545 545 if (MANDMODE(zp->z_mode)) {
546 546 if (error = chklock(vp, FREAD,
547 547 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
548 548 ZFS_EXIT(zfsvfs);
549 549 return (error);
550 550 }
551 551 }
552 552
553 553 /*
554 554 * If we're in FRSYNC mode, sync out this znode before reading it.
555 555 */
556 556 if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
557 557 zil_commit(zfsvfs->z_log, zp->z_id);
558 558
559 559 /*
560 560 * Lock the range against changes.
561 561 */
562 562 locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
563 563 uio->uio_loffset, uio->uio_resid, RL_READER);
564 564
565 565 /*
566 566 * If we are reading past end-of-file we can skip
567 567 * to the end; but we might still need to set atime.
568 568 */
569 569 if (uio->uio_loffset >= zp->z_size) {
570 570 error = 0;
571 571 goto out;
572 572 }
573 573
574 574 ASSERT(uio->uio_loffset < zp->z_size);
575 575 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
576 576
577 577 if ((uio->uio_extflg == UIO_XUIO) &&
578 578 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
579 579 int nblk;
580 580 int blksz = zp->z_blksz;
581 581 uint64_t offset = uio->uio_loffset;
582 582
583 583 xuio = (xuio_t *)uio;
584 584 if ((ISP2(blksz))) {
585 585 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
586 586 blksz)) / blksz;
587 587 } else {
588 588 ASSERT(offset + n <= blksz);
589 589 nblk = 1;
590 590 }
591 591 (void) dmu_xuio_init(xuio, nblk);
592 592
593 593 if (vn_has_cached_data(vp)) {
594 594 /*
595 595 * For simplicity, we always allocate a full buffer
596 596 * even if we only expect to read a portion of a block.
597 597 */
598 598 while (--nblk >= 0) {
599 599 (void) dmu_xuio_add(xuio,
600 600 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
601 601 blksz), 0, blksz);
602 602 }
603 603 }
604 604 }
605 605
606 606 while (n > 0) {
607 607 nbytes = MIN(n, zfs_read_chunk_size -
608 608 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
609 609
610 610 if (vn_has_cached_data(vp)) {
611 611 error = mappedread(vp, nbytes, uio);
612 612 } else {
613 613 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
614 614 uio, nbytes);
615 615 }
616 616 if (error) {
617 617 /* convert checksum errors into IO errors */
618 618 if (error == ECKSUM)
619 619 error = SET_ERROR(EIO);
620 620 break;
621 621 }
622 622
623 623 n -= nbytes;
624 624 }
625 625 out:
626 626 rangelock_exit(lr);
627 627
628 628 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
629 629 ZFS_EXIT(zfsvfs);
630 630 return (error);
631 631 }
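
The read loop never issues more than zfs_read_chunk_size (the tunable above) per pass, and trims the first pass so later passes start chunk-aligned: P2PHASE(x, align) is x & (align - 1) for a power-of-two align. A worked sketch of the arithmetic, using the same macros (hypothetical helper, not part of the source):

    #include <sys/types.h>
    #include <sys/sysmacros.h>

    /*
     * With a 1 MB chunk, a 3 MB read starting at offset 0x180000
     * (1.5 MB) is issued as 512 KB, 1 MB, 1 MB, 512 KB -- no single
     * pass crosses a chunk boundary.
     */
    static ssize_t
    read_pass_size(ssize_t n, offset_t off, offset_t chunk)
    {
            return (MIN(n, chunk - P2PHASE(off, chunk)));
    }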
632 632
633 633 /*
634 634 * Write the bytes to a file.
635 635 *
636 636 * IN: vp - vnode of file to be written to.
637 637 * uio - structure supplying write location, range info,
638 638 * and data buffer.
639 639 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
640 640 * set if in append mode.
641 641 * cr - credentials of caller.
642 642 * ct - caller context (NFS/CIFS fem monitor only)
643 643 *
644 644 * OUT: uio - updated offset and range.
645 645 *
646 646 * RETURN: 0 on success, error code on failure.
647 647 *
648 648 * Timestamps:
649 649 * vp - ctime|mtime updated if byte count > 0
650 650 */
651 651
652 652 /* ARGSUSED */
653 653 static int
654 654 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
655 655 {
656 656 znode_t *zp = VTOZ(vp);
657 657 rlim64_t limit = uio->uio_llimit;
658 658 ssize_t start_resid = uio->uio_resid;
659 659 ssize_t tx_bytes;
660 660 uint64_t end_size;
661 661 dmu_tx_t *tx;
662 662 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
663 663 zilog_t *zilog;
664 664 offset_t woff;
665 665 ssize_t n, nbytes;
666 666 int max_blksz = zfsvfs->z_max_blksz;
667 667 int error = 0;
668 + int prev_error;
668 669 arc_buf_t *abuf;
669 670 iovec_t *aiov = NULL;
670 671 xuio_t *xuio = NULL;
671 672 int i_iov = 0;
672 673 int iovcnt = uio->uio_iovcnt;
673 674 iovec_t *iovp = uio->uio_iov;
674 675 int write_eof;
675 676 int count = 0;
676 677 sa_bulk_attr_t bulk[4];
677 678 uint64_t mtime[2], ctime[2];
678 679
679 680 /*
680 681 * Fasttrack empty write
681 682 */
682 683 n = start_resid;
683 684 if (n == 0)
684 685 return (0);
685 686
686 687 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
687 688 limit = MAXOFFSET_T;
688 689
689 690 ZFS_ENTER(zfsvfs);
690 691 ZFS_VERIFY_ZP(zp);
691 692
692 693 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
693 694 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
694 695 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
695 696 &zp->z_size, 8);
696 697 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
697 698 &zp->z_pflags, 8);
698 699
699 700 /*
700 701          * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots), our
701 702 * callers might not be able to detect properly that we are read-only,
702 703 * so check it explicitly here.
703 704 */
704 705 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
705 706 ZFS_EXIT(zfsvfs);
706 707 return (SET_ERROR(EROFS));
707 708 }
708 709
709 710 /*
710 711 * If immutable or not appending then return EPERM.
711 712 * Intentionally allow ZFS_READONLY through here.
712 713 * See zfs_zaccess_common()
713 714 */
714 715 if ((zp->z_pflags & ZFS_IMMUTABLE) ||
715 716 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
716 717 (uio->uio_loffset < zp->z_size))) {
717 718 ZFS_EXIT(zfsvfs);
718 719 return (SET_ERROR(EPERM));
719 720 }
720 721
721 722 zilog = zfsvfs->z_log;
722 723
723 724 /*
724 725 * Validate file offset
725 726 */
726 727 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
727 728 if (woff < 0) {
728 729 ZFS_EXIT(zfsvfs);
729 730 return (SET_ERROR(EINVAL));
730 731 }
731 732
732 733 /*
733 734 * Check for mandatory locks before calling rangelock_enter()
734 735 * in order to prevent a deadlock with locks set via fcntl().
735 736 */
736 737 if (MANDMODE((mode_t)zp->z_mode) &&
737 738 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
738 739 ZFS_EXIT(zfsvfs);
739 740 return (error);
740 741 }
741 742
742 743 /*
743 744          * Pre-fault the pages to ensure slow (e.g. NFS) pages
744 745          * don't hold up the txg.
745 746 * Skip this if uio contains loaned arc_buf.
746 747 */
747 748 if ((uio->uio_extflg == UIO_XUIO) &&
748 749 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
749 750 xuio = (xuio_t *)uio;
750 751 else
751 752 uio_prefaultpages(MIN(n, max_blksz), uio);
752 753
753 754 /*
754 755 * If in append mode, set the io offset pointer to eof.
755 756 */
756 757 locked_range_t *lr;
757 758 if (ioflag & FAPPEND) {
758 759 /*
759 760 * Obtain an appending range lock to guarantee file append
760 761 * semantics. We reset the write offset once we have the lock.
761 762 */
762 763 lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
763 764 woff = lr->lr_offset;
764 765 if (lr->lr_length == UINT64_MAX) {
765 766 /*
766 767 * We overlocked the file because this write will cause
767 768 * the file block size to increase.
768 769 * Note that zp_size cannot change with this lock held.
769 770 */
770 771 woff = zp->z_size;
771 772 }
772 773 uio->uio_loffset = woff;
773 774 } else {
774 775 /*
775 776 * Note that if the file block size will change as a result of
776 777 * this write, then this range lock will lock the entire file
777 778 * so that we can re-write the block safely.
778 779 */
779 780 lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
780 781 }
781 782
782 783 if (woff >= limit) {
783 784 rangelock_exit(lr);
784 785 ZFS_EXIT(zfsvfs);
785 786 return (SET_ERROR(EFBIG));
786 787 }
787 788
788 789 if ((woff + n) > limit || woff > (limit - n))
789 790 n = limit - woff;
790 791
791 792 /* Will this write extend the file length? */
792 793 write_eof = (woff + n > zp->z_size);
793 794
794 795 end_size = MAX(zp->z_size, woff + n);
795 796
796 797 /*
797 798 * Write the file in reasonable size chunks. Each chunk is written
798 799 * in a separate transaction; this keeps the intent log records small
799 800 * and allows us to do more fine-grained space accounting.
800 801 */
801 802 while (n > 0) {
802 803 abuf = NULL;
803 804 woff = uio->uio_loffset;
804 805 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
805 806 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
806 807 if (abuf != NULL)
807 808 dmu_return_arcbuf(abuf);
808 809 error = SET_ERROR(EDQUOT);
809 810 break;
810 811 }
811 812
812 813 if (xuio && abuf == NULL) {
813 814 ASSERT(i_iov < iovcnt);
814 815 aiov = &iovp[i_iov];
815 816 abuf = dmu_xuio_arcbuf(xuio, i_iov);
816 817 dmu_xuio_clear(xuio, i_iov);
817 818 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
818 819 iovec_t *, aiov, arc_buf_t *, abuf);
819 820 ASSERT((aiov->iov_base == abuf->b_data) ||
820 821 ((char *)aiov->iov_base - (char *)abuf->b_data +
821 822 aiov->iov_len == arc_buf_size(abuf)));
822 823 i_iov++;
823 824 } else if (abuf == NULL && n >= max_blksz &&
824 825 woff >= zp->z_size &&
825 826 P2PHASE(woff, max_blksz) == 0 &&
826 827 zp->z_blksz == max_blksz) {
827 828 /*
828 829 * This write covers a full block. "Borrow" a buffer
829 830 * from the dmu so that we can fill it before we enter
830 831 * a transaction. This avoids the possibility of
831 832 * holding up the transaction if the data copy hangs
832 833 * up on a pagefault (e.g., from an NFS server mapping).
833 834 */
834 835 size_t cbytes;
835 836
836 837 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
837 838 max_blksz);
838 839 ASSERT(abuf != NULL);
839 840 ASSERT(arc_buf_size(abuf) == max_blksz);
840 841 if (error = uiocopy(abuf->b_data, max_blksz,
841 842 UIO_WRITE, uio, &cbytes)) {
842 843 dmu_return_arcbuf(abuf);
843 844 break;
844 845 }
845 846 ASSERT(cbytes == max_blksz);
846 847 }
847 848
848 849 /*
849 850 * Start a transaction.
850 851 */
851 852 tx = dmu_tx_create(zfsvfs->z_os);
852 853 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
853 854 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
854 855 zfs_sa_upgrade_txholds(tx, zp);
855 856 error = dmu_tx_assign(tx, TXG_WAIT);
856 857 if (error) {
857 858 dmu_tx_abort(tx);
858 859 if (abuf != NULL)
859 860 dmu_return_arcbuf(abuf);
860 861 break;
861 862 }
862 863
863 864 /*
864 865 * If rangelock_enter() over-locked we grow the blocksize
865 866 * and then reduce the lock range. This will only happen
866 867 * on the first iteration since rangelock_reduce() will
867 868 * shrink down lr_length to the appropriate size.
868 869 */
869 870 if (lr->lr_length == UINT64_MAX) {
870 871 uint64_t new_blksz;
871 872
872 873 if (zp->z_blksz > max_blksz) {
873 874 /*
874 875 * File's blocksize is already larger than the
875 876 * "recordsize" property. Only let it grow to
876 877 * the next power of 2.
877 878 */
878 879 ASSERT(!ISP2(zp->z_blksz));
879 880 new_blksz = MIN(end_size,
880 881 1 << highbit64(zp->z_blksz));
881 882 } else {
882 883 new_blksz = MIN(end_size, max_blksz);
883 884 }
884 885 zfs_grow_blocksize(zp, new_blksz, tx);
885 886 rangelock_reduce(lr, woff, n);
886 887 }
887 888
888 889 /*
889 890 * XXX - should we really limit each write to z_max_blksz?
890 891 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
891 892 */
892 893 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
893 894
894 895 if (abuf == NULL) {
895 896 tx_bytes = uio->uio_resid;
896 897 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
897 898 uio, nbytes, tx);
898 899 tx_bytes -= uio->uio_resid;
899 900 } else {
900 901 tx_bytes = nbytes;
901 902 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
902 903 /*
903 904 * If this is not a full block write, but we are
904 905 * extending the file past EOF and this data starts
905 906 * block-aligned, use assign_arcbuf(). Otherwise,
906 907 * write via dmu_write().
907 908 */
908 909 if (tx_bytes < max_blksz && (!write_eof ||
909 910 aiov->iov_base != abuf->b_data)) {
910 911 ASSERT(xuio);
911 912 dmu_write(zfsvfs->z_os, zp->z_id, woff,
912 913 aiov->iov_len, aiov->iov_base, tx);
913 914 dmu_return_arcbuf(abuf);
914 915 xuio_stat_wbuf_copied();
915 916 } else {
916 917 ASSERT(xuio || tx_bytes == max_blksz);
917 918 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
918 919 woff, abuf, tx);
919 920 }
920 921 ASSERT(tx_bytes <= uio->uio_resid);
921 922 uioskip(uio, tx_bytes);
922 923 }
923 924 if (tx_bytes && vn_has_cached_data(vp)) {
924 925 update_pages(vp, woff,
925 926 tx_bytes, zfsvfs->z_os, zp->z_id);
926 927 }
927 928
928 929 /*
929 930 * If we made no progress, we're done. If we made even
930 931 * partial progress, update the znode and ZIL accordingly.
931 932 */
932 933 if (tx_bytes == 0) {
933 934 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
934 935 (void *)&zp->z_size, sizeof (uint64_t), tx);
935 936 dmu_tx_commit(tx);
936 937 ASSERT(error != 0);
937 938 break;
938 939 }
939 940
940 941 /*
941 942 * Clear Set-UID/Set-GID bits on successful write if not
942 943                  * privileged and at least one of the execute bits is set.
943 944 *
944 945                  * It would be nice to do this after all writes have
945 946 * been done, but that would still expose the ISUID/ISGID
946 947 * to another app after the partial write is committed.
947 948 *
948 949 * Note: we don't call zfs_fuid_map_id() here because
949 950 * user 0 is not an ephemeral uid.
950 951 */
951 952 mutex_enter(&zp->z_acl_lock);
952 953 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
953 954 (S_IXUSR >> 6))) != 0 &&
954 955 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
955 956 secpolicy_vnode_setid_retain(cr,
956 957 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
957 958 uint64_t newmode;
958 959 zp->z_mode &= ~(S_ISUID | S_ISGID);
959 960 newmode = zp->z_mode;
960 961 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
961 962 (void *)&newmode, sizeof (uint64_t), tx);
962 963 }
963 964 mutex_exit(&zp->z_acl_lock);
964 965
964 965
965 966 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
966 967 B_TRUE);
967 968
968 969 /*
969 970 * Update the file size (zp_size) if it has changed;
970 971 * account for possible concurrent updates.
971 972 */
972 973 while ((end_size = zp->z_size) < uio->uio_loffset) {
973 974 (void) atomic_cas_64(&zp->z_size, end_size,
974 975 uio->uio_loffset);
975 - ASSERT(error == 0);
976 976 }
977 977 /*
978 978 * If we are replaying and eof is non zero then force
979 979 * the file size to the specified eof. Note, there's no
980 980 * concurrency during replay.
981 981 */
982 982 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
983 983 zp->z_size = zfsvfs->z_replay_eof;
984 984
985 + /*
986 + * Keep track of a possible pre-existing error from a partial
987 + * write via dmu_write_uio_dbuf above.
988 + */
989 + prev_error = error;
985 990 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
986 991
987 992 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
988 993 dmu_tx_commit(tx);
989 994
990 - if (error != 0)
995 + if (prev_error != 0 || error != 0)
991 996 break;
992 997 ASSERT(tx_bytes == nbytes);
993 998 n -= nbytes;
994 999
995 1000 if (!xuio && n > 0)
996 1001 uio_prefaultpages(MIN(n, max_blksz), uio);
997 1002 }
998 1003
999 1004 rangelock_exit(lr);
1000 1005
1001 1006 /*
1002 1007 * If we're in replay mode, or we made no progress, return error.
1003 1008 * Otherwise, it's at least a partial write, so it's successful.
1004 1009 */
1005 1010 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1006 1011 ZFS_EXIT(zfsvfs);
1007 1012 return (error);
1008 1013 }
1009 1014
1010 1015 if (ioflag & (FSYNC | FDSYNC) ||
1011 1016 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1012 1017 zil_commit(zilog, zp->z_id);
1013 1018
1014 1019 ZFS_EXIT(zfsvfs);
1015 1020 return (0);
1016 1021 }
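
The hunks above are the substance of OS-4319: dmu_write_uio_dbuf() can fail after copying only part of a chunk (tx_bytes > 0 with error != 0), and the old code then did "error = sa_bulk_update(...)" unconditionally, overwriting the write error with 0 so the loop kept running and the caller reported success. The deleted ASSERT(error == 0) in the z_size update loop falls out of the same observation: reaching that point with a pending error is now legal. Condensed from the diff (identifiers abbreviated, not standalone code):

    /*
     * Before: the chunk's partial-write error is silently lost.
     */
    error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
        uio, nbytes, tx);                               /* e.g. EFAULT */
    /* ... */
    error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); /* 0: lost! */

    /*
     * After: remember the write error before reusing the variable,
     * and stop if either step failed.
     */
    prev_error = error;
    error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
    if (prev_error != 0 || error != 0)
            break;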
1017 1022
1018 1023 /* ARGSUSED */
1019 1024 void
1020 1025 zfs_get_done(zgd_t *zgd, int error)
1021 1026 {
1022 1027 znode_t *zp = zgd->zgd_private;
1023 1028 objset_t *os = zp->z_zfsvfs->z_os;
1024 1029
1025 1030 if (zgd->zgd_db)
1026 1031 dmu_buf_rele(zgd->zgd_db, zgd);
1027 1032
1028 1033 rangelock_exit(zgd->zgd_lr);
1029 1034
1030 1035 /*
1031 1036 * Release the vnode asynchronously as we currently have the
1032 1037 * txg stopped from syncing.
1033 1038 */
1034 1039 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1035 1040
1036 1041 kmem_free(zgd, sizeof (zgd_t));
1037 1042 }
1038 1043
1039 1044 #ifdef DEBUG
1040 1045 static int zil_fault_io = 0;
1041 1046 #endif
1042 1047
1043 1048 /*
1044 1049 * Get data to generate a TX_WRITE intent log record.
1045 1050 */
1046 1051 int
1047 1052 zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1048 1053 {
1049 1054 zfsvfs_t *zfsvfs = arg;
1050 1055 objset_t *os = zfsvfs->z_os;
1051 1056 znode_t *zp;
1052 1057 uint64_t object = lr->lr_foid;
1053 1058 uint64_t offset = lr->lr_offset;
1054 1059 uint64_t size = lr->lr_length;
1055 1060 dmu_buf_t *db;
1056 1061 zgd_t *zgd;
1057 1062 int error = 0;
1058 1063
1059 1064 ASSERT3P(lwb, !=, NULL);
1060 1065 ASSERT3P(zio, !=, NULL);
1061 1066 ASSERT3U(size, !=, 0);
1062 1067
1063 1068 /*
1064 1069 * Nothing to do if the file has been removed
1065 1070 */
1066 1071 if (zfs_zget(zfsvfs, object, &zp) != 0)
1067 1072 return (SET_ERROR(ENOENT));
1068 1073 if (zp->z_unlinked) {
1069 1074 /*
1070 1075 * Release the vnode asynchronously as we currently have the
1071 1076 * txg stopped from syncing.
1072 1077 */
1073 1078 VN_RELE_ASYNC(ZTOV(zp),
1074 1079 dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1075 1080 return (SET_ERROR(ENOENT));
1076 1081 }
1077 1082
1078 1083 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1079 1084 zgd->zgd_lwb = lwb;
1080 1085 zgd->zgd_private = zp;
1081 1086
1082 1087 /*
1083 1088 * Write records come in two flavors: immediate and indirect.
1084 1089 * For small writes it's cheaper to store the data with the
1085 1090 * log record (immediate); for large writes it's cheaper to
1086 1091 * sync the data and get a pointer to it (indirect) so that
1087 1092 * we don't have to write the data twice.
1088 1093 */
1089 1094 if (buf != NULL) { /* immediate write */
1090 1095 zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
1091 1096 offset, size, RL_READER);
1092 1097 /* test for truncation needs to be done while range locked */
1093 1098 if (offset >= zp->z_size) {
1094 1099 error = SET_ERROR(ENOENT);
1095 1100 } else {
1096 1101 error = dmu_read(os, object, offset, size, buf,
1097 1102 DMU_READ_NO_PREFETCH);
1098 1103 }
1099 1104 ASSERT(error == 0 || error == ENOENT);
1100 1105 } else { /* indirect write */
1101 1106 /*
1102 1107 * Have to lock the whole block to ensure when it's
1103 1108 * written out and its checksum is being calculated
1104 1109 * that no one can change the data. We need to re-check
1105 1110 * blocksize after we get the lock in case it's changed!
1106 1111 */
1107 1112 for (;;) {
1108 1113 uint64_t blkoff;
1109 1114 size = zp->z_blksz;
1110 1115 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1111 1116 offset -= blkoff;
1112 1117 zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
1113 1118 offset, size, RL_READER);
1114 1119 if (zp->z_blksz == size)
1115 1120 break;
1116 1121 offset += blkoff;
1117 1122 rangelock_exit(zgd->zgd_lr);
1118 1123 }
1119 1124 /* test for truncation needs to be done while range locked */
1120 1125 if (lr->lr_offset >= zp->z_size)
1121 1126 error = SET_ERROR(ENOENT);
1122 1127 #ifdef DEBUG
1123 1128 if (zil_fault_io) {
1124 1129 error = SET_ERROR(EIO);
1125 1130 zil_fault_io = 0;
1126 1131 }
1127 1132 #endif
1128 1133 if (error == 0)
1129 1134 error = dmu_buf_hold(os, object, offset, zgd, &db,
1130 1135 DMU_READ_NO_PREFETCH);
1131 1136
1132 1137 if (error == 0) {
1133 1138 blkptr_t *bp = &lr->lr_blkptr;
1134 1139
1135 1140 zgd->zgd_db = db;
1136 1141 zgd->zgd_bp = bp;
1137 1142
1138 1143 ASSERT(db->db_offset == offset);
1139 1144 ASSERT(db->db_size == size);
1140 1145
1141 1146 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1142 1147 zfs_get_done, zgd);
1143 1148 ASSERT(error || lr->lr_length <= size);
1144 1149
1145 1150 /*
1146 1151 * On success, we need to wait for the write I/O
1147 1152 * initiated by dmu_sync() to complete before we can
1148 1153 * release this dbuf. We will finish everything up
1149 1154 * in the zfs_get_done() callback.
1150 1155 */
1151 1156 if (error == 0)
1152 1157 return (0);
1153 1158
1154 1159 if (error == EALREADY) {
1155 1160 lr->lr_common.lrc_txtype = TX_WRITE2;
1156 1161 /*
1157 1162 * TX_WRITE2 relies on the data previously
1158 1163 * written by the TX_WRITE that caused
1159 1164 * EALREADY. We zero out the BP because
1160 1165 * it is the old, currently-on-disk BP.
1161 1166 */
1162 1167 zgd->zgd_bp = NULL;
1163 1168 BP_ZERO(bp);
1164 1169 error = 0;
1165 1170 }
1166 1171 }
1167 1172 }
1168 1173
1169 1174 zfs_get_done(zgd, error);
1170 1175
1171 1176 return (error);
1172 1177 }
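
zfs_get_data() serves both flavors described in the comment above: for an immediate write the caller passes a buffer and the data is copied into the log record itself; for an indirect write buf is NULL and dmu_sync() writes the block in place so the record can just point at it. The choice is made earlier, when the itx is built in zfs_log_write(), driven by write size and the immediate-write/logbias tuning. A hypothetical condensation of the trade-off, for orientation only (these names are not the real WR_* constants):

    #include <sys/types.h>

    typedef enum { WR_IMMEDIATE_SKETCH, WR_INDIRECT_SKETCH } wr_flavor_t;

    /*
     * Not the real policy -- just its shape: small records are
     * cheaper to copy into the log; large ones are cheaper to sync
     * in place and reference, avoiding writing the data twice.
     */
    static wr_flavor_t
    wr_flavor(uint64_t size, uint64_t immediate_max)
    {
            return (size <= immediate_max ?
                WR_IMMEDIATE_SKETCH : WR_INDIRECT_SKETCH);
    }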
1173 1178
1174 1179 /*ARGSUSED*/
1175 1180 static int
1176 1181 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1177 1182 caller_context_t *ct)
1178 1183 {
1179 1184 znode_t *zp = VTOZ(vp);
1180 1185 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1181 1186 int error;
1182 1187
1183 1188 ZFS_ENTER(zfsvfs);
1184 1189 ZFS_VERIFY_ZP(zp);
1185 1190
1186 1191 if (flag & V_ACE_MASK)
1187 1192 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1188 1193 else
1189 1194 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1190 1195
1191 1196 ZFS_EXIT(zfsvfs);
1192 1197 return (error);
1193 1198 }
1194 1199
1195 1200 /*
1196 1201 * If vnode is for a device return a specfs vnode instead.
1197 1202 */
1198 1203 static int
1199 1204 specvp_check(vnode_t **vpp, cred_t *cr)
1200 1205 {
1201 1206 int error = 0;
1202 1207
1203 1208 if (IS_DEVVP(*vpp)) {
1204 1209 struct vnode *svp;
1205 1210
1206 1211 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1207 1212 VN_RELE(*vpp);
1208 1213 if (svp == NULL)
1209 1214 error = SET_ERROR(ENOSYS);
1210 1215 *vpp = svp;
1211 1216 }
1212 1217 return (error);
1213 1218 }
1214 1219
1215 1220
1216 1221 /*
1217 1222 * Lookup an entry in a directory, or an extended attribute directory.
1218 1223 * If it exists, return a held vnode reference for it.
1219 1224 *
1220 1225 * IN: dvp - vnode of directory to search.
1221 1226 * nm - name of entry to lookup.
1222 1227 * pnp - full pathname to lookup [UNUSED].
1223 1228 * flags - LOOKUP_XATTR set if looking for an attribute.
1224 1229 * rdir - root directory vnode [UNUSED].
1225 1230 * cr - credentials of caller.
1226 1231 * ct - caller context
1227 1232 * direntflags - directory lookup flags
1228 1233 * realpnp - returned pathname.
1229 1234 *
1230 1235 * OUT: vpp - vnode of located entry, NULL if not found.
1231 1236 *
1232 1237 * RETURN: 0 on success, error code on failure.
1233 1238 *
1234 1239 * Timestamps:
1235 1240 * NA
1236 1241 */
1237 1242 /* ARGSUSED */
1238 1243 static int
1239 1244 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1240 1245 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1241 1246 int *direntflags, pathname_t *realpnp)
1242 1247 {
1243 1248 znode_t *zdp = VTOZ(dvp);
1244 1249 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1245 1250 int error = 0;
1246 1251
1247 1252 /*
1248 1253 * Fast path lookup, however we must skip DNLC lookup
1249 1254 * for case folding or normalizing lookups because the
1250 1255 * DNLC code only stores the passed in name. This means
1251 1256 * creating 'a' and removing 'A' on a case insensitive
1252 1257 * file system would work, but DNLC still thinks 'a'
1253 1258 * exists and won't let you create it again on the next
1254 1259 * pass through fast path.
1255 1260 */
1256 1261 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1257 1262
1258 1263 if (dvp->v_type != VDIR) {
1259 1264 return (SET_ERROR(ENOTDIR));
1260 1265 } else if (zdp->z_sa_hdl == NULL) {
1261 1266 return (SET_ERROR(EIO));
1262 1267 }
1263 1268
1264 1269 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1265 1270 error = zfs_fastaccesschk_execute(zdp, cr);
1266 1271 if (!error) {
1267 1272 *vpp = dvp;
1268 1273 VN_HOLD(*vpp);
1269 1274 return (0);
1270 1275 }
1271 1276 return (error);
1272 1277 } else if (!zdp->z_zfsvfs->z_norm &&
1273 1278 (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {
1274 1279
1275 1280 vnode_t *tvp = dnlc_lookup(dvp, nm);
1276 1281
1277 1282 if (tvp) {
1278 1283 error = zfs_fastaccesschk_execute(zdp, cr);
1279 1284 if (error) {
1280 1285 VN_RELE(tvp);
1281 1286 return (error);
1282 1287 }
1283 1288 if (tvp == DNLC_NO_VNODE) {
1284 1289 VN_RELE(tvp);
1285 1290 return (SET_ERROR(ENOENT));
1286 1291 } else {
1287 1292 *vpp = tvp;
1288 1293 return (specvp_check(vpp, cr));
1289 1294 }
1290 1295 }
1291 1296 }
1292 1297 }
1293 1298
1294 1299 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1295 1300
1296 1301 ZFS_ENTER(zfsvfs);
1297 1302 ZFS_VERIFY_ZP(zdp);
1298 1303
1299 1304 *vpp = NULL;
1300 1305
1301 1306 if (flags & LOOKUP_XATTR) {
1302 1307 /*
1303 1308 * If the xattr property is off, refuse the lookup request.
1304 1309 */
1305 1310 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1306 1311 ZFS_EXIT(zfsvfs);
1307 1312 return (SET_ERROR(EINVAL));
1308 1313 }
1309 1314
1310 1315 /*
1311 1316                * We don't allow recursive attributes.
1312 1317 * Maybe someday we will.
1313 1318 */
1314 1319 if (zdp->z_pflags & ZFS_XATTR) {
1315 1320 ZFS_EXIT(zfsvfs);
1316 1321 return (SET_ERROR(EINVAL));
1317 1322 }
1318 1323
1319 1324 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1320 1325 ZFS_EXIT(zfsvfs);
1321 1326 return (error);
1322 1327 }
1323 1328
1324 1329 /*
1325 1330 * Do we have permission to get into attribute directory?
1326 1331 */
1327 1332
1328 1333 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1329 1334 B_FALSE, cr)) {
1330 1335 VN_RELE(*vpp);
1331 1336 *vpp = NULL;
1332 1337 }
1333 1338
1334 1339 ZFS_EXIT(zfsvfs);
1335 1340 return (error);
1336 1341 }
1337 1342
1338 1343 if (dvp->v_type != VDIR) {
1339 1344 ZFS_EXIT(zfsvfs);
1340 1345 return (SET_ERROR(ENOTDIR));
1341 1346 }
1342 1347
1343 1348 /*
1344 1349 * Check accessibility of directory.
1345 1350 */
1346 1351
1347 1352 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1348 1353 ZFS_EXIT(zfsvfs);
1349 1354 return (error);
1350 1355 }
1351 1356
1352 1357 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1353 1358 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1354 1359 ZFS_EXIT(zfsvfs);
1355 1360 return (SET_ERROR(EILSEQ));
1356 1361 }
1357 1362
1358 1363 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1359 1364 if (error == 0)
1360 1365 error = specvp_check(vpp, cr);
1361 1366
1362 1367 ZFS_EXIT(zfsvfs);
1363 1368 return (error);
1364 1369 }
1365 1370
1366 1371 /*
1367 1372 * Attempt to create a new entry in a directory. If the entry
1368 1373 * already exists, truncate the file if permissible, else return
1369 1374 * an error. Return the vp of the created or trunc'd file.
1370 1375 *
1371 1376 * IN: dvp - vnode of directory to put new file entry in.
1372 1377 * name - name of new file entry.
1373 1378 * vap - attributes of new file.
1374 1379 * excl - flag indicating exclusive or non-exclusive mode.
1375 1380 * mode - mode to open file with.
1376 1381 * cr - credentials of caller.
1377 1382 * flag - large file flag [UNUSED].
1378 1383 * ct - caller context
1379 1384 * vsecp - ACL to be set
1380 1385 *
1381 1386 * OUT: vpp - vnode of created or trunc'd entry.
1382 1387 *
1383 1388 * RETURN: 0 on success, error code on failure.
1384 1389 *
1385 1390 * Timestamps:
1386 1391 * dvp - ctime|mtime updated if new entry created
1387 1392 * vp - ctime|mtime always, atime if new
1388 1393 */
1389 1394
1390 1395 /* ARGSUSED */
1391 1396 static int
1392 1397 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1393 1398 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1394 1399 vsecattr_t *vsecp)
1395 1400 {
1396 1401 znode_t *zp, *dzp = VTOZ(dvp);
1397 1402 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1398 1403 zilog_t *zilog;
1399 1404 objset_t *os;
1400 1405 zfs_dirlock_t *dl;
1401 1406 dmu_tx_t *tx;
1402 1407 int error;
1403 1408 ksid_t *ksid;
1404 1409 uid_t uid;
1405 1410 gid_t gid = crgetgid(cr);
1406 1411 zfs_acl_ids_t acl_ids;
1407 1412 boolean_t fuid_dirtied;
1408 1413 boolean_t have_acl = B_FALSE;
1409 1414 boolean_t waited = B_FALSE;
1410 1415
1411 1416 /*
1412 1417 * If we have an ephemeral id, ACL, or XVATTR then
1413 1418 * make sure file system is at proper version
1414 1419 */
1415 1420
1416 1421 ksid = crgetsid(cr, KSID_OWNER);
1417 1422 if (ksid)
1418 1423 uid = ksid_getid(ksid);
1419 1424 else
1420 1425 uid = crgetuid(cr);
1421 1426
1422 1427 if (zfsvfs->z_use_fuids == B_FALSE &&
1423 1428 (vsecp || (vap->va_mask & AT_XVATTR) ||
1424 1429 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1425 1430 return (SET_ERROR(EINVAL));
1426 1431
1427 1432 ZFS_ENTER(zfsvfs);
1428 1433 ZFS_VERIFY_ZP(dzp);
1429 1434 os = zfsvfs->z_os;
1430 1435 zilog = zfsvfs->z_log;
1431 1436
1432 1437 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1433 1438 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1434 1439 ZFS_EXIT(zfsvfs);
1435 1440 return (SET_ERROR(EILSEQ));
1436 1441 }
1437 1442
1438 1443 if (vap->va_mask & AT_XVATTR) {
1439 1444 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1440 1445 crgetuid(cr), cr, vap->va_type)) != 0) {
1441 1446 ZFS_EXIT(zfsvfs);
1442 1447 return (error);
1443 1448 }
1444 1449 }
1445 1450 top:
1446 1451 *vpp = NULL;
1447 1452
1448 1453 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1449 1454 vap->va_mode &= ~VSVTX;
1450 1455
1451 1456 if (*name == '\0') {
1452 1457 /*
1453 1458 * Null component name refers to the directory itself.
1454 1459 */
1455 1460 VN_HOLD(dvp);
1456 1461 zp = dzp;
1457 1462 dl = NULL;
1458 1463 error = 0;
1459 1464 } else {
1460 1465 /* possible VN_HOLD(zp) */
1461 1466 int zflg = 0;
1462 1467
1463 1468 if (flag & FIGNORECASE)
1464 1469 zflg |= ZCILOOK;
1465 1470
1466 1471 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1467 1472 NULL, NULL);
1468 1473 if (error) {
1469 1474 if (have_acl)
1470 1475 zfs_acl_ids_free(&acl_ids);
1471 1476 if (strcmp(name, "..") == 0)
1472 1477 error = SET_ERROR(EISDIR);
1473 1478 ZFS_EXIT(zfsvfs);
1474 1479 return (error);
1475 1480 }
1476 1481 }
1477 1482
1478 1483 if (zp == NULL) {
1479 1484 uint64_t txtype;
1480 1485
1481 1486 /*
1482 1487 * Create a new file object and update the directory
1483 1488 * to reference it.
1484 1489 */
1485 1490 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1486 1491 if (have_acl)
1487 1492 zfs_acl_ids_free(&acl_ids);
1488 1493 goto out;
1489 1494 }
1490 1495
1491 1496 /*
1492 1497 * We only support the creation of regular files in
1493 1498 * extended attribute directories.
1494 1499 */
1495 1500
1496 1501 if ((dzp->z_pflags & ZFS_XATTR) &&
1497 1502 (vap->va_type != VREG)) {
1498 1503 if (have_acl)
1499 1504 zfs_acl_ids_free(&acl_ids);
1500 1505 error = SET_ERROR(EINVAL);
1501 1506 goto out;
1502 1507 }
1503 1508
1504 1509 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1505 1510 cr, vsecp, &acl_ids)) != 0)
1506 1511 goto out;
1507 1512 have_acl = B_TRUE;
1508 1513
1509 1514 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1510 1515 zfs_acl_ids_free(&acl_ids);
1511 1516 error = SET_ERROR(EDQUOT);
1512 1517 goto out;
1513 1518 }
1514 1519
1515 1520 tx = dmu_tx_create(os);
1516 1521
1517 1522 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1518 1523 ZFS_SA_BASE_ATTR_SIZE);
1519 1524
1520 1525 fuid_dirtied = zfsvfs->z_fuid_dirty;
1521 1526 if (fuid_dirtied)
1522 1527 zfs_fuid_txhold(zfsvfs, tx);
1523 1528 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1524 1529 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1525 1530 if (!zfsvfs->z_use_sa &&
1526 1531 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1527 1532 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1528 1533 0, acl_ids.z_aclp->z_acl_bytes);
1529 1534 }
1530 1535 error = dmu_tx_assign(tx,
1531 1536 (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1532 1537 if (error) {
1533 1538 zfs_dirent_unlock(dl);
1534 1539 if (error == ERESTART) {
1535 1540 waited = B_TRUE;
1536 1541 dmu_tx_wait(tx);
1537 1542 dmu_tx_abort(tx);
1538 1543 goto top;
1539 1544 }
1540 1545 zfs_acl_ids_free(&acl_ids);
1541 1546 dmu_tx_abort(tx);
1542 1547 ZFS_EXIT(zfsvfs);
1543 1548 return (error);
1544 1549 }
1545 1550 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1546 1551
1547 1552 if (fuid_dirtied)
1548 1553 zfs_fuid_sync(zfsvfs, tx);
1549 1554
1550 1555 (void) zfs_link_create(dl, zp, tx, ZNEW);
1551 1556 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1552 1557 if (flag & FIGNORECASE)
1553 1558 txtype |= TX_CI;
1554 1559 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1555 1560 vsecp, acl_ids.z_fuidp, vap);
1556 1561 zfs_acl_ids_free(&acl_ids);
1557 1562 dmu_tx_commit(tx);
1558 1563 } else {
1559 1564 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1560 1565
1561 1566 if (have_acl)
1562 1567 zfs_acl_ids_free(&acl_ids);
1563 1568 have_acl = B_FALSE;
1564 1569
1565 1570 /*
1566 1571 * A directory entry already exists for this name.
1567 1572 */
1568 1573 /*
1569 1574 * Can't truncate an existing file if in exclusive mode.
1570 1575 */
1571 1576 if (excl == EXCL) {
1572 1577 error = SET_ERROR(EEXIST);
1573 1578 goto out;
1574 1579 }
1575 1580 /*
1576 1581 * Can't open a directory for writing.
1577 1582 */
1578 1583 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1579 1584 error = SET_ERROR(EISDIR);
1580 1585 goto out;
1581 1586 }
1582 1587 /*
1583 1588 * Verify requested access to file.
1584 1589 */
1585 1590 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1586 1591 goto out;
1587 1592 }
1588 1593
1589 1594 mutex_enter(&dzp->z_lock);
1590 1595 dzp->z_seq++;
1591 1596 mutex_exit(&dzp->z_lock);
1592 1597
1593 1598 /*
1594 1599 * Truncate regular files if requested.
1595 1600 */
1596 1601 if ((ZTOV(zp)->v_type == VREG) &&
1597 1602 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1598 1603 /* we can't hold any locks when calling zfs_freesp() */
1599 1604 zfs_dirent_unlock(dl);
1600 1605 dl = NULL;
1601 1606 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1602 1607 if (error == 0) {
1603 1608 vnevent_create(ZTOV(zp), ct);
1604 1609 }
1605 1610 }
1606 1611 }
1607 1612 out:
1608 1613
1609 1614 if (dl)
1610 1615 zfs_dirent_unlock(dl);
1611 1616
1612 1617 if (error) {
1613 1618 if (zp)
1614 1619 VN_RELE(ZTOV(zp));
1615 1620 } else {
1616 1621 *vpp = ZTOV(zp);
1617 1622 error = specvp_check(vpp, cr);
1618 1623 }
1619 1624
1620 1625 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1621 1626 zil_commit(zilog, 0);
1622 1627
1623 1628 ZFS_EXIT(zfsvfs);
1624 1629 return (error);
1625 1630 }
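
zfs_create() backs open(2) with O_CREAT, and the branches above map directly onto familiar flag combinations. Illustrative only:

    #include <fcntl.h>

    int
    main(void)
    {
            /* (excl == EXCL) branch: EEXIST if "f" already exists. */
            int fd1 = open("f", O_CREAT | O_EXCL | O_WRONLY, 0644);

            /*
             * Existing regular file with va_size == 0: truncated via
             * zfs_freesp(), which is called with no locks held.
             */
            int fd2 = open("f", O_CREAT | O_TRUNC | O_WRONLY, 0644);

            return (fd1 < 0 && fd2 < 0);
    }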
1626 1631
1627 1632 /*
1628 1633 * Remove an entry from a directory.
1629 1634 *
1630 1635 * IN: dvp - vnode of directory to remove entry from.
1631 1636 * name - name of entry to remove.
1632 1637 * cr - credentials of caller.
1633 1638 * ct - caller context
1634 1639 * flags - case flags
1635 1640 *
1636 1641 * RETURN: 0 on success, error code on failure.
1637 1642 *
1638 1643 * Timestamps:
1639 1644 * dvp - ctime|mtime
1640 1645 * vp - ctime (if nlink > 0)
1641 1646 */
1642 1647
1643 1648 uint64_t null_xattr = 0;
1644 1649
1645 1650 /*ARGSUSED*/
1646 1651 static int
1647 1652 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1648 1653 int flags)
1649 1654 {
1650 1655 znode_t *zp, *dzp = VTOZ(dvp);
1651 1656 znode_t *xzp;
1652 1657 vnode_t *vp;
1653 1658 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1654 1659 zilog_t *zilog;
1655 1660 uint64_t acl_obj, xattr_obj;
1656 1661 uint64_t xattr_obj_unlinked = 0;
1657 1662 uint64_t obj = 0;
1658 1663 zfs_dirlock_t *dl;
1659 1664 dmu_tx_t *tx;
1660 1665 boolean_t may_delete_now, delete_now = FALSE;
1661 1666 boolean_t unlinked, toobig = FALSE;
1662 1667 uint64_t txtype;
1663 1668 pathname_t *realnmp = NULL;
1664 1669 pathname_t realnm;
1665 1670 int error;
1666 1671 int zflg = ZEXISTS;
1667 1672 boolean_t waited = B_FALSE;
1668 1673
1669 1674 ZFS_ENTER(zfsvfs);
1670 1675 ZFS_VERIFY_ZP(dzp);
1671 1676 zilog = zfsvfs->z_log;
1672 1677
1673 1678 if (flags & FIGNORECASE) {
1674 1679 zflg |= ZCILOOK;
1675 1680 pn_alloc(&realnm);
1676 1681 realnmp = &realnm;
1677 1682 }
1678 1683
1679 1684 top:
1680 1685 xattr_obj = 0;
1681 1686 xzp = NULL;
1682 1687 /*
1683 1688 * Attempt to lock directory; fail if entry doesn't exist.
1684 1689 */
1685 1690 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1686 1691 NULL, realnmp)) {
1687 1692 if (realnmp)
1688 1693 pn_free(realnmp);
1689 1694 ZFS_EXIT(zfsvfs);
1690 1695 return (error);
1691 1696 }
1692 1697
1693 1698 vp = ZTOV(zp);
1694 1699
1695 1700 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1696 1701 goto out;
1697 1702 }
1698 1703
1699 1704 /*
1700 1705 * Need to use rmdir for removing directories.
1701 1706 */
1702 1707 if (vp->v_type == VDIR) {
1703 1708 error = SET_ERROR(EPERM);
1704 1709 goto out;
1705 1710 }
1706 1711
1707 1712 vnevent_remove(vp, dvp, name, ct);
1708 1713
1709 1714 if (realnmp)
1710 1715 dnlc_remove(dvp, realnmp->pn_buf);
1711 1716 else
1712 1717 dnlc_remove(dvp, name);
1713 1718
1714 1719 mutex_enter(&vp->v_lock);
1715 1720 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1716 1721 mutex_exit(&vp->v_lock);
1717 1722
1718 1723 /*
1719 1724 * We may delete the znode now, or we may put it in the unlinked set;
1720 1725 * it depends on whether we're the last link, and on whether there are
1721 1726 * other holds on the vnode. So we dmu_tx_hold() the right things to
1722 1727 * allow for either case.
1723 1728 */
1724 1729 obj = zp->z_id;
1725 1730 tx = dmu_tx_create(zfsvfs->z_os);
1726 1731 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1727 1732 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1728 1733 zfs_sa_upgrade_txholds(tx, zp);
1729 1734 zfs_sa_upgrade_txholds(tx, dzp);
1730 1735 if (may_delete_now) {
1731 1736 toobig =
1732 1737 zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1733 1738 /* if the file is too big, only hold_free a token amount */
1734 1739 dmu_tx_hold_free(tx, zp->z_id, 0,
1735 1740 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1736 1741 }
1737 1742
1738 1743 /* are there any extended attributes? */
1739 1744 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1740 1745 &xattr_obj, sizeof (xattr_obj));
1741 1746 if (error == 0 && xattr_obj) {
1742 1747 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1743 1748 ASSERT0(error);
1744 1749 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1745 1750 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1746 1751 }
1747 1752
1748 1753 mutex_enter(&zp->z_lock);
1749 1754 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1750 1755 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1751 1756 mutex_exit(&zp->z_lock);
1752 1757
1753 1758 /* charge as an update -- would be nice not to charge at all */
1754 1759 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1755 1760
1756 1761 /*
1757 1762 * Mark this transaction as typically resulting in a net free of space
1758 1763 */
1759 1764 dmu_tx_mark_netfree(tx);
1760 1765
1761 1766 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1762 1767 if (error) {
1763 1768 zfs_dirent_unlock(dl);
1764 1769 VN_RELE(vp);
1765 1770 if (xzp)
1766 1771 VN_RELE(ZTOV(xzp));
1767 1772 if (error == ERESTART) {
1768 1773 waited = B_TRUE;
1769 1774 dmu_tx_wait(tx);
1770 1775 dmu_tx_abort(tx);
1771 1776 goto top;
1772 1777 }
1773 1778 if (realnmp)
1774 1779 pn_free(realnmp);
1775 1780 dmu_tx_abort(tx);
1776 1781 ZFS_EXIT(zfsvfs);
1777 1782 return (error);
1778 1783 }
1779 1784
1780 1785 /*
1781 1786 * Remove the directory entry.
1782 1787 */
1783 1788 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1784 1789
1785 1790 if (error) {
1786 1791 dmu_tx_commit(tx);
1787 1792 goto out;
1788 1793 }
1789 1794
1790 1795 if (unlinked) {
1791 1796 /*
1792 1797 * Hold z_lock so that we can make sure that the ACL obj
1793 1798 	 * hasn't changed; it could have been deleted due to
1794 1799 * zfs_sa_upgrade().
1795 1800 */
1796 1801 mutex_enter(&zp->z_lock);
1797 1802 mutex_enter(&vp->v_lock);
1798 1803 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1799 1804 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1800 1805 delete_now = may_delete_now && !toobig &&
1801 1806 vp->v_count == 1 && !vn_has_cached_data(vp) &&
1802 1807 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1803 1808 acl_obj;
1804 1809 mutex_exit(&vp->v_lock);
1805 1810 }
1806 1811
1807 1812 if (delete_now) {
1808 1813 if (xattr_obj_unlinked) {
1809 1814 ASSERT3U(xzp->z_links, ==, 2);
1810 1815 mutex_enter(&xzp->z_lock);
1811 1816 xzp->z_unlinked = 1;
1812 1817 xzp->z_links = 0;
1813 1818 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1814 1819 &xzp->z_links, sizeof (xzp->z_links), tx);
1815 1820 ASSERT3U(error, ==, 0);
1816 1821 mutex_exit(&xzp->z_lock);
1817 1822 zfs_unlinked_add(xzp, tx);
1818 1823
1819 1824 if (zp->z_is_sa)
1820 1825 error = sa_remove(zp->z_sa_hdl,
1821 1826 SA_ZPL_XATTR(zfsvfs), tx);
1822 1827 else
1823 1828 error = sa_update(zp->z_sa_hdl,
1824 1829 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1825 1830 sizeof (uint64_t), tx);
1826 1831 ASSERT0(error);
1827 1832 }
1828 1833 mutex_enter(&vp->v_lock);
1829 1834 VN_RELE_LOCKED(vp);
1830 1835 ASSERT0(vp->v_count);
1831 1836 mutex_exit(&vp->v_lock);
1832 1837 mutex_exit(&zp->z_lock);
1833 1838 zfs_znode_delete(zp, tx);
1834 1839 } else if (unlinked) {
1835 1840 mutex_exit(&zp->z_lock);
1836 1841 zfs_unlinked_add(zp, tx);
1837 1842 }
1838 1843
1839 1844 txtype = TX_REMOVE;
1840 1845 if (flags & FIGNORECASE)
1841 1846 txtype |= TX_CI;
1842 1847 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1843 1848
1844 1849 dmu_tx_commit(tx);
1845 1850 out:
1846 1851 if (realnmp)
1847 1852 pn_free(realnmp);
1848 1853
1849 1854 zfs_dirent_unlock(dl);
1850 1855
1851 1856 if (!delete_now)
1852 1857 VN_RELE(vp);
1853 1858 if (xzp)
1854 1859 VN_RELE(ZTOV(xzp));
1855 1860
1856 1861 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1857 1862 zil_commit(zilog, 0);
1858 1863
1859 1864 ZFS_EXIT(zfsvfs);
1860 1865 return (error);
1861 1866 }
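
/*
 * Review sketch, not part of this change: the "delete now" decision
 * above restated as a predicate. The helper name is hypothetical; the
 * conditions are exactly those evaluated under z_lock/v_lock in
 * zfs_remove(). The znode is destroyed in this transaction only when
 * all of them hold; otherwise it is parked in the unlinked set and
 * reaped later.
 */
static boolean_t
zfs_delete_now_ok(vnode_t *vp, znode_t *zp, boolean_t may_delete_now,
    boolean_t toobig, uint64_t xattr_obj, uint64_t xattr_obj_unlinked,
    uint64_t acl_obj)
{
	/* Caller holds zp->z_lock and vp->v_lock, as zfs_remove() does. */
	return (may_delete_now && !toobig &&
	    vp->v_count == 1 && !vn_has_cached_data(vp) &&
	    xattr_obj == xattr_obj_unlinked &&
	    zfs_external_acl(zp) == acl_obj);
}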
1862 1867
1863 1868 /*
1864 1869 * Create a new directory and insert it into dvp using the name
1865 1870 * provided. Return a pointer to the inserted directory.
1866 1871 *
1867 1872 * IN: dvp - vnode of directory to add subdir to.
1868 1873 * dirname - name of new directory.
1869 1874 * vap - attributes of new directory.
1870 1875 * cr - credentials of caller.
1871 1876 * ct - caller context
1872 1877 * flags - case flags
1873 1878 * vsecp - ACL to be set
1874 1879 *
1875 1880 * OUT: vpp - vnode of created directory.
1876 1881 *
1877 1882 * RETURN: 0 on success, error code on failure.
1878 1883 *
1879 1884 * Timestamps:
1880 1885 * dvp - ctime|mtime updated
1881 1886 * vp - ctime|mtime|atime updated
1882 1887 */
1883 1888 /*ARGSUSED*/
1884 1889 static int
1885 1890 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1886 1891 caller_context_t *ct, int flags, vsecattr_t *vsecp)
1887 1892 {
1888 1893 znode_t *zp, *dzp = VTOZ(dvp);
1889 1894 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1890 1895 zilog_t *zilog;
1891 1896 zfs_dirlock_t *dl;
1892 1897 uint64_t txtype;
1893 1898 dmu_tx_t *tx;
1894 1899 int error;
1895 1900 int zf = ZNEW;
1896 1901 ksid_t *ksid;
1897 1902 uid_t uid;
1898 1903 gid_t gid = crgetgid(cr);
1899 1904 zfs_acl_ids_t acl_ids;
1900 1905 boolean_t fuid_dirtied;
1901 1906 boolean_t waited = B_FALSE;
1902 1907
1903 1908 ASSERT(vap->va_type == VDIR);
1904 1909
1905 1910 /*
1906 1911 * If we have an ephemeral id, ACL, or XVATTR then
1906 1911 	 * make sure the file system is at the proper version
1908 1913 */
1909 1914
1910 1915 ksid = crgetsid(cr, KSID_OWNER);
1911 1916 if (ksid)
1912 1917 uid = ksid_getid(ksid);
1913 1918 else
1914 1919 uid = crgetuid(cr);
1915 1920 if (zfsvfs->z_use_fuids == B_FALSE &&
1916 1921 (vsecp || (vap->va_mask & AT_XVATTR) ||
1917 1922 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1918 1923 return (SET_ERROR(EINVAL));
1919 1924
1920 1925 ZFS_ENTER(zfsvfs);
1921 1926 ZFS_VERIFY_ZP(dzp);
1922 1927 zilog = zfsvfs->z_log;
1923 1928
1924 1929 if (dzp->z_pflags & ZFS_XATTR) {
1925 1930 ZFS_EXIT(zfsvfs);
1926 1931 return (SET_ERROR(EINVAL));
1927 1932 }
1928 1933
1929 1934 if (zfsvfs->z_utf8 && u8_validate(dirname,
1930 1935 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1931 1936 ZFS_EXIT(zfsvfs);
1932 1937 return (SET_ERROR(EILSEQ));
1933 1938 }
1934 1939 if (flags & FIGNORECASE)
1935 1940 zf |= ZCILOOK;
1936 1941
1937 1942 if (vap->va_mask & AT_XVATTR) {
1938 1943 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1939 1944 crgetuid(cr), cr, vap->va_type)) != 0) {
1940 1945 ZFS_EXIT(zfsvfs);
1941 1946 return (error);
1942 1947 }
1943 1948 }
1944 1949
1945 1950 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1946 1951 vsecp, &acl_ids)) != 0) {
1947 1952 ZFS_EXIT(zfsvfs);
1948 1953 return (error);
1949 1954 }
1950 1955 /*
1951 1956 * First make sure the new directory doesn't exist.
1952 1957 *
1953 1958 * Existence is checked first to make sure we don't return
1954 1959 	 * EACCES instead of EEXIST, which can cause some applications
1955 1960 * to fail.
1956 1961 */
1957 1962 top:
1958 1963 *vpp = NULL;
1959 1964
1960 1965 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1961 1966 NULL, NULL)) {
1962 1967 zfs_acl_ids_free(&acl_ids);
1963 1968 ZFS_EXIT(zfsvfs);
1964 1969 return (error);
1965 1970 }
1966 1971
1967 1972 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1968 1973 zfs_acl_ids_free(&acl_ids);
1969 1974 zfs_dirent_unlock(dl);
1970 1975 ZFS_EXIT(zfsvfs);
1971 1976 return (error);
1972 1977 }
1973 1978
1974 1979 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1975 1980 zfs_acl_ids_free(&acl_ids);
1976 1981 zfs_dirent_unlock(dl);
1977 1982 ZFS_EXIT(zfsvfs);
1978 1983 return (SET_ERROR(EDQUOT));
1979 1984 }
1980 1985
1981 1986 /*
1982 1987 * Add a new entry to the directory.
1983 1988 */
1984 1989 tx = dmu_tx_create(zfsvfs->z_os);
1985 1990 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1986 1991 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1987 1992 fuid_dirtied = zfsvfs->z_fuid_dirty;
1988 1993 if (fuid_dirtied)
1989 1994 zfs_fuid_txhold(zfsvfs, tx);
1990 1995 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1991 1996 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1992 1997 acl_ids.z_aclp->z_acl_bytes);
1993 1998 }
1994 1999
1995 2000 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1996 2001 ZFS_SA_BASE_ATTR_SIZE);
1997 2002
1998 2003 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1999 2004 if (error) {
2000 2005 zfs_dirent_unlock(dl);
2001 2006 if (error == ERESTART) {
2002 2007 waited = B_TRUE;
2003 2008 dmu_tx_wait(tx);
2004 2009 dmu_tx_abort(tx);
2005 2010 goto top;
2006 2011 }
2007 2012 zfs_acl_ids_free(&acl_ids);
2008 2013 dmu_tx_abort(tx);
2009 2014 ZFS_EXIT(zfsvfs);
2010 2015 return (error);
2011 2016 }
2012 2017
2013 2018 /*
2014 2019 * Create new node.
2015 2020 */
2016 2021 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2017 2022
2018 2023 if (fuid_dirtied)
2019 2024 zfs_fuid_sync(zfsvfs, tx);
2020 2025
2021 2026 /*
2022 2027 * Now put new name in parent dir.
2023 2028 */
2024 2029 (void) zfs_link_create(dl, zp, tx, ZNEW);
2025 2030
2026 2031 *vpp = ZTOV(zp);
2027 2032
2028 2033 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2029 2034 if (flags & FIGNORECASE)
2030 2035 txtype |= TX_CI;
2031 2036 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2032 2037 acl_ids.z_fuidp, vap);
2033 2038
2034 2039 zfs_acl_ids_free(&acl_ids);
2035 2040
2036 2041 dmu_tx_commit(tx);
2037 2042
2038 2043 zfs_dirent_unlock(dl);
2039 2044
2040 2045 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2041 2046 zil_commit(zilog, 0);
2042 2047
2043 2048 ZFS_EXIT(zfsvfs);
2044 2049 return (0);
2045 2050 }
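
/*
 * Review sketch, not part of this change: the dmu_tx_assign() restart
 * idiom shared by zfs_remove(), zfs_mkdir(), and zfs_rmdir() above,
 * distilled ("..." marks elided caller-specific work). On ERESTART the
 * caller drops its locks, waits for the write throttle to clear,
 * aborts, and retries; TXG_NOTHROTTLE on the second pass keeps the
 * retry from being throttled a second time.
 *
 *	waited = B_FALSE;
 * top:
 *	tx = dmu_tx_create(os);
 *	... dmu_tx_hold_*() the objects this op will dirty ...
 *	error = dmu_tx_assign(tx,
 *	    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		... drop locks and holds taken since "top" ...
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 */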
2046 2051
2047 2052 /*
2048 2053 * Remove a directory subdir entry. If the current working
2049 2054 * directory is the same as the subdir to be removed, the
2050 2055 * remove will fail.
2051 2056 *
2052 2057 * IN: dvp - vnode of directory to remove from.
2053 2058 * name - name of directory to be removed.
2054 2059 * cwd - vnode of current working directory.
2055 2060 * cr - credentials of caller.
2056 2061 * ct - caller context
2057 2062 * flags - case flags
2058 2063 *
2059 2064 * RETURN: 0 on success, error code on failure.
2060 2065 *
2061 2066 * Timestamps:
2062 2067 * dvp - ctime|mtime updated
2063 2068 */
2064 2069 /*ARGSUSED*/
2065 2070 static int
2066 2071 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2067 2072 caller_context_t *ct, int flags)
2068 2073 {
2069 2074 znode_t *dzp = VTOZ(dvp);
2070 2075 znode_t *zp;
2071 2076 vnode_t *vp;
2072 2077 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2073 2078 zilog_t *zilog;
2074 2079 zfs_dirlock_t *dl;
2075 2080 dmu_tx_t *tx;
2076 2081 int error;
2077 2082 int zflg = ZEXISTS;
2078 2083 boolean_t waited = B_FALSE;
2079 2084
2080 2085 ZFS_ENTER(zfsvfs);
2081 2086 ZFS_VERIFY_ZP(dzp);
2082 2087 zilog = zfsvfs->z_log;
2083 2088
2084 2089 if (flags & FIGNORECASE)
2085 2090 zflg |= ZCILOOK;
2086 2091 top:
2087 2092 zp = NULL;
2088 2093
2089 2094 /*
2090 2095 * Attempt to lock directory; fail if entry doesn't exist.
2091 2096 */
2092 2097 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2093 2098 NULL, NULL)) {
2094 2099 ZFS_EXIT(zfsvfs);
2095 2100 return (error);
2096 2101 }
2097 2102
2098 2103 vp = ZTOV(zp);
2099 2104
2100 2105 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2101 2106 goto out;
2102 2107 }
2103 2108
2104 2109 if (vp->v_type != VDIR) {
2105 2110 error = SET_ERROR(ENOTDIR);
2106 2111 goto out;
2107 2112 }
2108 2113
2109 2114 if (vp == cwd) {
2110 2115 error = SET_ERROR(EINVAL);
2111 2116 goto out;
2112 2117 }
2113 2118
2114 2119 vnevent_rmdir(vp, dvp, name, ct);
2115 2120
2116 2121 /*
2117 2122 	 * Grab a lock on the directory to make sure that no one is
2118 2123 	 * trying to add (or look up) entries while we are removing it.
2119 2124 */
2120 2125 rw_enter(&zp->z_name_lock, RW_WRITER);
2121 2126
2122 2127 /*
2123 2128 * Grab a lock on the parent pointer to make sure we play well
2124 2129 * with the treewalk and directory rename code.
2125 2130 */
2126 2131 rw_enter(&zp->z_parent_lock, RW_WRITER);
2127 2132
2128 2133 tx = dmu_tx_create(zfsvfs->z_os);
2129 2134 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2130 2135 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2131 2136 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2132 2137 zfs_sa_upgrade_txholds(tx, zp);
2133 2138 zfs_sa_upgrade_txholds(tx, dzp);
2134 2139 dmu_tx_mark_netfree(tx);
2135 2140 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2136 2141 if (error) {
2137 2142 rw_exit(&zp->z_parent_lock);
2138 2143 rw_exit(&zp->z_name_lock);
2139 2144 zfs_dirent_unlock(dl);
2140 2145 VN_RELE(vp);
2141 2146 if (error == ERESTART) {
2142 2147 waited = B_TRUE;
2143 2148 dmu_tx_wait(tx);
2144 2149 dmu_tx_abort(tx);
2145 2150 goto top;
2146 2151 }
2147 2152 dmu_tx_abort(tx);
2148 2153 ZFS_EXIT(zfsvfs);
2149 2154 return (error);
2150 2155 }
2151 2156
2152 2157 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2153 2158
2154 2159 if (error == 0) {
2155 2160 uint64_t txtype = TX_RMDIR;
2156 2161 if (flags & FIGNORECASE)
2157 2162 txtype |= TX_CI;
2158 2163 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2159 2164 }
2160 2165
2161 2166 dmu_tx_commit(tx);
2162 2167
2163 2168 rw_exit(&zp->z_parent_lock);
2164 2169 rw_exit(&zp->z_name_lock);
2165 2170 out:
2166 2171 zfs_dirent_unlock(dl);
2167 2172
2168 2173 VN_RELE(vp);
2169 2174
2170 2175 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2171 2176 zil_commit(zilog, 0);
2172 2177
2173 2178 ZFS_EXIT(zfsvfs);
2174 2179 return (error);
2175 2180 }
2176 2181
2177 2182 /*
2178 2183 * Read as many directory entries as will fit into the provided
2179 2184 * buffer from the given directory cursor position (specified in
2180 2185 * the uio structure).
2181 2186 *
2182 2187 * IN: vp - vnode of directory to read.
2183 2188 * uio - structure supplying read location, range info,
2184 2189 * and return buffer.
2185 2190 * cr - credentials of caller.
2186 2191 * ct - caller context
2187 2192 * flags - case flags
2188 2193 *
2189 2194 * OUT: uio - updated offset and range, buffer filled.
2190 2195 * eofp - set to true if end-of-file detected.
2191 2196 *
2192 2197 * RETURN: 0 on success, error code on failure.
2193 2198 *
2194 2199 * Timestamps:
2195 2200 * vp - atime updated
2196 2201 *
2197 2202 	 * Note that the low 4 bits of the cookie returned by zap are always zero.
2198 2203 * This allows us to use the low range for "special" directory entries:
2199 2204 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2200 2205 * we use the offset 2 for the '.zfs' directory.
2201 2206 */
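/*
 * Review note, not part of this change: an illustration of the cookie
 * space described above.
 *
 *	cookie 0	"." (synthesized)
 *	cookie 1	".." (synthesized)
 *	cookie 2	".zfs" (synthesized; root only, when visible)
 *	cookie > 3	a zap_cursor_serialize() value; real ZAP cookies
 *			have the low 4 bits clear, so they cannot
 *			collide with the synthetic entries above.
 *
 * This is why the code below starts a fresh cursor when offset <= 3
 * and reconstructs a serialized cursor otherwise.
 */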
2202 2207 /* ARGSUSED */
2203 2208 static int
2204 2209 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2205 2210 caller_context_t *ct, int flags)
2206 2211 {
2207 2212 znode_t *zp = VTOZ(vp);
2208 2213 iovec_t *iovp;
2209 2214 edirent_t *eodp;
2210 2215 dirent64_t *odp;
2211 2216 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2212 2217 objset_t *os;
2213 2218 caddr_t outbuf;
2214 2219 size_t bufsize;
2215 2220 zap_cursor_t zc;
2216 2221 zap_attribute_t zap;
2217 2222 uint_t bytes_wanted;
2218 2223 uint64_t offset; /* must be unsigned; checks for < 1 */
2219 2224 uint64_t parent;
2220 2225 int local_eof;
2221 2226 int outcount;
2222 2227 int error;
2223 2228 uint8_t prefetch;
2224 2229 boolean_t check_sysattrs;
2225 2230
2226 2231 ZFS_ENTER(zfsvfs);
2227 2232 ZFS_VERIFY_ZP(zp);
2228 2233
2229 2234 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2230 2235 &parent, sizeof (parent))) != 0) {
2231 2236 ZFS_EXIT(zfsvfs);
2232 2237 return (error);
2233 2238 }
2234 2239
2235 2240 /*
2236 2241 * If we are not given an eof variable,
2237 2242 * use a local one.
2238 2243 */
2239 2244 if (eofp == NULL)
2240 2245 eofp = &local_eof;
2241 2246
2242 2247 /*
2243 2248 * Check for valid iov_len.
2244 2249 */
2245 2250 if (uio->uio_iov->iov_len <= 0) {
2246 2251 ZFS_EXIT(zfsvfs);
2247 2252 return (SET_ERROR(EINVAL));
2248 2253 }
2249 2254
2250 2255 /*
2251 2256 	 * Quit if the directory has been removed (POSIX)
2252 2257 */
2253 2258 if ((*eofp = zp->z_unlinked) != 0) {
2254 2259 ZFS_EXIT(zfsvfs);
2255 2260 return (0);
2256 2261 }
2257 2262
2258 2263 error = 0;
2259 2264 os = zfsvfs->z_os;
2260 2265 offset = uio->uio_loffset;
2261 2266 prefetch = zp->z_zn_prefetch;
2262 2267
2263 2268 /*
2264 2269 * Initialize the iterator cursor.
2265 2270 */
2266 2271 if (offset <= 3) {
2267 2272 /*
2268 2273 * Start iteration from the beginning of the directory.
2269 2274 */
2270 2275 zap_cursor_init(&zc, os, zp->z_id);
2271 2276 } else {
2272 2277 /*
2273 2278 * The offset is a serialized cursor.
2274 2279 */
2275 2280 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2276 2281 }
2277 2282
2278 2283 /*
2279 2284 * Get space to change directory entries into fs independent format.
2280 2285 */
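/*
 * Review note, not part of this change: when the destination is user
 * memory, or spans more than one iovec, the entries are staged in a
 * kernel bounce buffer and uiomove()d out in one shot at the end; a
 * single UIO_SYSSPACE iovec is filled in place and the uio is
 * adjusted by hand.
 */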
2281 2286 iovp = uio->uio_iov;
2282 2287 bytes_wanted = iovp->iov_len;
2283 2288 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2284 2289 bufsize = bytes_wanted;
2285 2290 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2286 2291 odp = (struct dirent64 *)outbuf;
2287 2292 } else {
2288 2293 bufsize = bytes_wanted;
2289 2294 outbuf = NULL;
2290 2295 odp = (struct dirent64 *)iovp->iov_base;
2291 2296 }
2292 2297 eodp = (struct edirent *)odp;
2293 2298
2294 2299 /*
2295 2300 	 * If this VFS supports the system attribute view interface, and
2296 2301 	 * we're looking at an extended attribute directory, and we care
2297 2302 	 * about normalization conflicts on this vfs, then we must check
2298 2303 * for normalization conflicts with the sysattr name space.
2299 2304 */
2300 2305 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2301 2306 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2302 2307 (flags & V_RDDIR_ENTFLAGS);
2303 2308
2304 2309 /*
2305 2310 * Transform to file-system independent format
2306 2311 */
2307 2312 outcount = 0;
2308 2313 while (outcount < bytes_wanted) {
2309 2314 ino64_t objnum;
2310 2315 ushort_t reclen;
2311 2316 off64_t *next = NULL;
2312 2317
2313 2318 /*
2314 2319 * Special case `.', `..', and `.zfs'.
2315 2320 */
2316 2321 if (offset == 0) {
2317 2322 (void) strcpy(zap.za_name, ".");
2318 2323 zap.za_normalization_conflict = 0;
2319 2324 objnum = zp->z_id;
2320 2325 } else if (offset == 1) {
2321 2326 (void) strcpy(zap.za_name, "..");
2322 2327 zap.za_normalization_conflict = 0;
2323 2328 objnum = parent;
2324 2329 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2325 2330 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2326 2331 zap.za_normalization_conflict = 0;
2327 2332 objnum = ZFSCTL_INO_ROOT;
2328 2333 } else {
2329 2334 /*
2330 2335 * Grab next entry.
2331 2336 */
2332 2337 if (error = zap_cursor_retrieve(&zc, &zap)) {
2333 2338 if ((*eofp = (error == ENOENT)) != 0)
2334 2339 break;
2335 2340 else
2336 2341 goto update;
2337 2342 }
2338 2343
2339 2344 if (zap.za_integer_length != 8 ||
2340 2345 zap.za_num_integers != 1) {
2341 2346 cmn_err(CE_WARN, "zap_readdir: bad directory "
2342 2347 "entry, obj = %lld, offset = %lld\n",
2343 2348 (u_longlong_t)zp->z_id,
2344 2349 (u_longlong_t)offset);
2345 2350 error = SET_ERROR(ENXIO);
2346 2351 goto update;
2347 2352 }
2348 2353
2349 2354 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2350 2355 /*
2351 2356 	 * Mac OS X can extract the object type here, e.g.:
2352 2357 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2353 2358 */
2354 2359
2355 2360 if (check_sysattrs && !zap.za_normalization_conflict) {
2356 2361 zap.za_normalization_conflict =
2357 2362 xattr_sysattr_casechk(zap.za_name);
2358 2363 }
2359 2364 }
2360 2365
2361 2366 if (flags & V_RDDIR_ACCFILTER) {
2362 2367 /*
2363 2368 * If we have no access at all, don't include
2364 2369 * this entry in the returned information
2365 2370 */
2366 2371 znode_t *ezp;
2367 2372 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2368 2373 goto skip_entry;
2369 2374 if (!zfs_has_access(ezp, cr)) {
2370 2375 VN_RELE(ZTOV(ezp));
2371 2376 goto skip_entry;
2372 2377 }
2373 2378 VN_RELE(ZTOV(ezp));
2374 2379 }
2375 2380
2376 2381 if (flags & V_RDDIR_ENTFLAGS)
2377 2382 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2378 2383 else
2379 2384 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2380 2385
2381 2386 /*
2382 2387 * Will this entry fit in the buffer?
2383 2388 */
2384 2389 if (outcount + reclen > bufsize) {
2385 2390 /*
2386 2391 * Did we manage to fit anything in the buffer?
2387 2392 */
2388 2393 if (!outcount) {
2389 2394 error = SET_ERROR(EINVAL);
2390 2395 goto update;
2391 2396 }
2392 2397 break;
2393 2398 }
2394 2399 if (flags & V_RDDIR_ENTFLAGS) {
2395 2400 /*
2396 2401 * Add extended flag entry:
2397 2402 */
2398 2403 eodp->ed_ino = objnum;
2399 2404 eodp->ed_reclen = reclen;
2400 2405 /* NOTE: ed_off is the offset for the *next* entry */
2401 2406 next = &(eodp->ed_off);
2402 2407 eodp->ed_eflags = zap.za_normalization_conflict ?
2403 2408 ED_CASE_CONFLICT : 0;
2404 2409 (void) strncpy(eodp->ed_name, zap.za_name,
2405 2410 EDIRENT_NAMELEN(reclen));
2406 2411 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2407 2412 } else {
2408 2413 /*
2409 2414 * Add normal entry:
2410 2415 */
2411 2416 odp->d_ino = objnum;
2412 2417 odp->d_reclen = reclen;
2413 2418 /* NOTE: d_off is the offset for the *next* entry */
2414 2419 next = &(odp->d_off);
2415 2420 (void) strncpy(odp->d_name, zap.za_name,
2416 2421 DIRENT64_NAMELEN(reclen));
2417 2422 odp = (dirent64_t *)((intptr_t)odp + reclen);
2418 2423 }
2419 2424 outcount += reclen;
2420 2425
2421 2426 ASSERT(outcount <= bufsize);
2422 2427
2423 2428 /* Prefetch znode */
2424 2429 if (prefetch)
2425 2430 dmu_prefetch(os, objnum, 0, 0, 0,
2426 2431 ZIO_PRIORITY_SYNC_READ);
2427 2432
2428 2433 skip_entry:
2429 2434 /*
2430 2435 * Move to the next entry, fill in the previous offset.
2431 2436 */
2432 2437 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2433 2438 zap_cursor_advance(&zc);
2434 2439 offset = zap_cursor_serialize(&zc);
2435 2440 } else {
2436 2441 offset += 1;
2437 2442 }
2438 2443 if (next)
2439 2444 *next = offset;
2440 2445 }
2441 2446 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2442 2447
2443 2448 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2444 2449 iovp->iov_base += outcount;
2445 2450 iovp->iov_len -= outcount;
2446 2451 uio->uio_resid -= outcount;
2447 2452 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2448 2453 /*
2449 2454 * Reset the pointer.
2450 2455 */
2451 2456 offset = uio->uio_loffset;
2452 2457 }
2453 2458
2454 2459 update:
2455 2460 zap_cursor_fini(&zc);
2456 2461 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2457 2462 kmem_free(outbuf, bufsize);
2458 2463
2459 2464 if (error == ENOENT)
2460 2465 error = 0;
2461 2466
2462 2467 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2463 2468
2464 2469 uio->uio_loffset = offset;
2465 2470 ZFS_EXIT(zfsvfs);
2466 2471 return (error);
2467 2472 }
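
/*
 * Review sketch, not part of this change: how an in-kernel caller
 * might drive the entry point above through the VOP layer. The
 * function name is hypothetical and error handling is trimmed.
 * Entries are walked by d_reclen; d_off in each entry is the cookie
 * of the *next* entry, as noted in zfs_readdir() above.
 */
static int
zfs_readdir_walk_demo(vnode_t *dvp, cred_t *cr)
{
	char buf[1024];
	iovec_t iov;
	uio_t uio;
	dirent64_t *dp;
	size_t len;
	int eof = 0, error;

	bzero(&uio, sizeof (uio));
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_loffset = 0;

	do {
		iov.iov_base = buf;
		iov.iov_len = sizeof (buf);
		uio.uio_iov = &iov;
		uio.uio_resid = sizeof (buf);

		error = VOP_READDIR(dvp, &uio, cr, &eof, NULL, 0);
		if (error != 0)
			break;

		len = sizeof (buf) - uio.uio_resid;
		for (dp = (dirent64_t *)(uintptr_t)buf;
		    (char *)dp < buf + len;
		    dp = (dirent64_t *)((uintptr_t)dp + dp->d_reclen)) {
			/* consume dp->d_name and dp->d_ino here */
		}
	} while (!eof && len != 0);

	return (error);
}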
2468 2473
2469 2474 ulong_t zfs_fsync_sync_cnt = 4;
2470 2475
2471 2476 static int
2472 2477 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2473 2478 {
2474 2479 znode_t *zp = VTOZ(vp);
2475 2480 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2476 2481
2477 2482 /*
2478 2483 * Regardless of whether this is required for standards conformance,
2479 2484 * this is the logical behavior when fsync() is called on a file with
2480 2485 * dirty pages. We use B_ASYNC since the ZIL transactions are already
2481 2486 * going to be pushed out as part of the zil_commit().
2482 2487 */
2483 2488 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2484 2489 (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2485 2490 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2486 2491
2487 2492 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2488 2493
2489 2494 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2490 2495 ZFS_ENTER(zfsvfs);
2491 2496 ZFS_VERIFY_ZP(zp);
2492 2497 zil_commit(zfsvfs->z_log, zp->z_id);
2493 2498 ZFS_EXIT(zfsvfs);
2494 2499 }
2495 2500 return (0);
2496 2501 }
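
/*
 * Review note, not part of this change: a typical in-kernel call into
 * the entry point above is
 *
 *	error = VOP_FSYNC(vp, FSYNC, cr, NULL);
 *
 * With os_sync == ZFS_SYNC_DISABLED the zil_commit() is skipped
 * entirely; with ZFS_SYNC_ALWAYS the data was already committed
 * synchronously at write time, so the commit here is cheap.
 */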
2497 2502
2498 2503
2499 2504 /*
2500 2505 * Get the requested file attributes and place them in the provided
2501 2506 * vattr structure.
2502 2507 *
2503 2508 * IN: vp - vnode of file.
2504 2509 * vap - va_mask identifies requested attributes.
2505 2510 * If AT_XVATTR set, then optional attrs are requested
2506 2511 * flags - ATTR_NOACLCHECK (CIFS server context)
2507 2512 * cr - credentials of caller.
2508 2513 * ct - caller context
2509 2514 *
2510 2515 * OUT: vap - attribute values.
2511 2516 *
2512 2517 * RETURN: 0 (always succeeds).
2513 2518 */
2514 2519 /* ARGSUSED */
2515 2520 static int
2516 2521 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2517 2522 caller_context_t *ct)
2518 2523 {
2519 2524 znode_t *zp = VTOZ(vp);
2520 2525 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2521 2526 int error = 0;
2522 2527 uint64_t links;
2523 2528 uint64_t mtime[2], ctime[2];
2524 2529 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2525 2530 xoptattr_t *xoap = NULL;
2526 2531 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2527 2532 sa_bulk_attr_t bulk[2];
2528 2533 int count = 0;
2529 2534
2530 2535 ZFS_ENTER(zfsvfs);
2531 2536 ZFS_VERIFY_ZP(zp);
2532 2537
2533 2538 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2534 2539
2535 2540 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2536 2541 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2537 2542
2538 2543 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2539 2544 ZFS_EXIT(zfsvfs);
2540 2545 return (error);
2541 2546 }
2542 2547
2543 2548 /*
2544 2549 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2545 2550 * Also, if we are the owner don't bother, since owner should
2546 2551 * always be allowed to read basic attributes of file.
2547 2552 */
2548 2553 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2549 2554 (vap->va_uid != crgetuid(cr))) {
2550 2555 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2551 2556 skipaclchk, cr)) {
2552 2557 ZFS_EXIT(zfsvfs);
2553 2558 return (error);
2554 2559 }
2555 2560 }
2556 2561
2557 2562 /*
2558 2563 * Return all attributes. It's cheaper to provide the answer
2559 2564 * than to determine whether we were asked the question.
2560 2565 */
2561 2566
2562 2567 mutex_enter(&zp->z_lock);
2563 2568 vap->va_type = vp->v_type;
2564 2569 vap->va_mode = zp->z_mode & MODEMASK;
2565 2570 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2566 2571 vap->va_nodeid = zp->z_id;
2567 2572 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2568 2573 links = zp->z_links + 1;
2569 2574 else
2570 2575 links = zp->z_links;
2571 2576 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2572 2577 vap->va_size = zp->z_size;
2573 2578 vap->va_rdev = vp->v_rdev;
2574 2579 vap->va_seq = zp->z_seq;
2575 2580
2576 2581 /*
2577 2582 * Add in any requested optional attributes and the create time.
2578 2583 * Also set the corresponding bits in the returned attribute bitmap.
2579 2584 */
2580 2585 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2581 2586 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2582 2587 xoap->xoa_archive =
2583 2588 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2584 2589 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2585 2590 }
2586 2591
2587 2592 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2588 2593 xoap->xoa_readonly =
2589 2594 ((zp->z_pflags & ZFS_READONLY) != 0);
2590 2595 XVA_SET_RTN(xvap, XAT_READONLY);
2591 2596 }
2592 2597
2593 2598 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2594 2599 xoap->xoa_system =
2595 2600 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2596 2601 XVA_SET_RTN(xvap, XAT_SYSTEM);
2597 2602 }
2598 2603
2599 2604 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2600 2605 xoap->xoa_hidden =
2601 2606 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2602 2607 XVA_SET_RTN(xvap, XAT_HIDDEN);
2603 2608 }
2604 2609
2605 2610 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2606 2611 xoap->xoa_nounlink =
2607 2612 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2608 2613 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2609 2614 }
2610 2615
2611 2616 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2612 2617 xoap->xoa_immutable =
2613 2618 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2614 2619 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2615 2620 }
2616 2621
2617 2622 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2618 2623 xoap->xoa_appendonly =
2619 2624 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2620 2625 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2621 2626 }
2622 2627
2623 2628 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2624 2629 xoap->xoa_nodump =
2625 2630 ((zp->z_pflags & ZFS_NODUMP) != 0);
2626 2631 XVA_SET_RTN(xvap, XAT_NODUMP);
2627 2632 }
2628 2633
2629 2634 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2630 2635 xoap->xoa_opaque =
2631 2636 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2632 2637 XVA_SET_RTN(xvap, XAT_OPAQUE);
2633 2638 }
2634 2639
2635 2640 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2636 2641 xoap->xoa_av_quarantined =
2637 2642 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2638 2643 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2639 2644 }
2640 2645
2641 2646 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2642 2647 xoap->xoa_av_modified =
2643 2648 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2644 2649 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2645 2650 }
2646 2651
2647 2652 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2648 2653 vp->v_type == VREG) {
2649 2654 zfs_sa_get_scanstamp(zp, xvap);
2650 2655 }
2651 2656
2652 2657 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2653 2658 uint64_t times[2];
2654 2659
2655 2660 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2656 2661 times, sizeof (times));
2657 2662 ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2658 2663 XVA_SET_RTN(xvap, XAT_CREATETIME);
2659 2664 }
2660 2665
2661 2666 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2662 2667 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2663 2668 XVA_SET_RTN(xvap, XAT_REPARSE);
2664 2669 }
2665 2670 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2666 2671 xoap->xoa_generation = zp->z_gen;
2667 2672 XVA_SET_RTN(xvap, XAT_GEN);
2668 2673 }
2669 2674
2670 2675 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2671 2676 xoap->xoa_offline =
2672 2677 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2673 2678 XVA_SET_RTN(xvap, XAT_OFFLINE);
2674 2679 }
2675 2680
2676 2681 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2677 2682 xoap->xoa_sparse =
2678 2683 ((zp->z_pflags & ZFS_SPARSE) != 0);
2679 2684 XVA_SET_RTN(xvap, XAT_SPARSE);
2680 2685 }
2681 2686 }
2682 2687
2683 2688 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2684 2689 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2685 2690 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2686 2691
2687 2692 mutex_exit(&zp->z_lock);
2688 2693
2689 2694 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2690 2695
2691 2696 if (zp->z_blksz == 0) {
2692 2697 /*
2693 2698 * Block size hasn't been set; suggest maximal I/O transfers.
2694 2699 */
2695 2700 vap->va_blksize = zfsvfs->z_max_blksz;
2696 2701 }
2697 2702
2698 2703 ZFS_EXIT(zfsvfs);
2699 2704 return (0);
2700 2705 }
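
/*
 * Review sketch, not part of this change: requesting one of the
 * optional attributes handled above through the xvattr interface.
 * The function name is hypothetical; error handling is trimmed.
 */
static boolean_t
zfs_immutable_demo(vnode_t *vp, cred_t *cr)
{
	xvattr_t xva;
	xoptattr_t *xoap;

	xva_init(&xva);			/* sets AT_XVATTR in va_mask */
	xoap = xva_getxoptattr(&xva);
	ASSERT(xoap != NULL);
	XVA_SET_REQ(&xva, XAT_IMMUTABLE);

	if (VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, NULL) != 0)
		return (B_FALSE);

	/* zfs_getattr() sets the RTN bit only if it filled the field */
	return (XVA_ISSET_RTN(&xva, XAT_IMMUTABLE) &&
	    xoap->xoa_immutable);
}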
2701 2706
2702 2707 /*
2703 2708 * Set the file attributes to the values contained in the
2704 2709 * vattr structure.
2705 2710 *
2706 2711 * IN: vp - vnode of file to be modified.
2707 2712 * vap - new attribute values.
2708 2713 * If AT_XVATTR set, then optional attrs are being set
2709 2714 * flags - ATTR_UTIME set if non-default time values provided.
2710 2715 * - ATTR_NOACLCHECK (CIFS context only).
2711 2716 * cr - credentials of caller.
2712 2717 * ct - caller context
2713 2718 *
2714 2719 * RETURN: 0 on success, error code on failure.
2715 2720 *
2716 2721 * Timestamps:
2717 2722 * vp - ctime updated, mtime updated if size changed.
2718 2723 */
2719 2724 /* ARGSUSED */
2720 2725 static int
2721 2726 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2722 2727 caller_context_t *ct)
2723 2728 {
2724 2729 znode_t *zp = VTOZ(vp);
2725 2730 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2726 2731 zilog_t *zilog;
2727 2732 dmu_tx_t *tx;
2728 2733 vattr_t oldva;
2729 2734 xvattr_t tmpxvattr;
2730 2735 uint_t mask = vap->va_mask;
2731 2736 uint_t saved_mask = 0;
2732 2737 int trim_mask = 0;
2733 2738 uint64_t new_mode;
2734 2739 uint64_t new_uid, new_gid;
2735 2740 uint64_t xattr_obj;
2736 2741 uint64_t mtime[2], ctime[2];
2737 2742 znode_t *attrzp;
2738 2743 int need_policy = FALSE;
2739 2744 int err, err2;
2740 2745 zfs_fuid_info_t *fuidp = NULL;
2741 2746 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2742 2747 xoptattr_t *xoap;
2743 2748 zfs_acl_t *aclp;
2744 2749 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2745 2750 boolean_t fuid_dirtied = B_FALSE;
2746 2751 sa_bulk_attr_t bulk[7], xattr_bulk[7];
2747 2752 int count = 0, xattr_count = 0;
2748 2753
2749 2754 if (mask == 0)
2750 2755 return (0);
2751 2756
2752 2757 if (mask & AT_NOSET)
2753 2758 return (SET_ERROR(EINVAL));
2754 2759
2755 2760 ZFS_ENTER(zfsvfs);
2756 2761 ZFS_VERIFY_ZP(zp);
2757 2762
2758 2763 zilog = zfsvfs->z_log;
2759 2764
2760 2765 /*
2761 2766 	 * Make sure that if we have an ephemeral uid/gid or xvattr specified,
2762 2767 	 * the file system is at the proper version level
2763 2768 */
2764 2769
2765 2770 if (zfsvfs->z_use_fuids == B_FALSE &&
2766 2771 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2767 2772 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2768 2773 (mask & AT_XVATTR))) {
2769 2774 ZFS_EXIT(zfsvfs);
2770 2775 return (SET_ERROR(EINVAL));
2771 2776 }
2772 2777
2773 2778 if (mask & AT_SIZE && vp->v_type == VDIR) {
2774 2779 ZFS_EXIT(zfsvfs);
2775 2780 return (SET_ERROR(EISDIR));
2776 2781 }
2777 2782
2778 2783 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2779 2784 ZFS_EXIT(zfsvfs);
2780 2785 return (SET_ERROR(EINVAL));
2781 2786 }
2782 2787
2783 2788 /*
2784 2789 * If this is an xvattr_t, then get a pointer to the structure of
2785 2790 * optional attributes. If this is NULL, then we have a vattr_t.
2786 2791 */
2787 2792 xoap = xva_getxoptattr(xvap);
2788 2793
2789 2794 xva_init(&tmpxvattr);
2790 2795
2791 2796 /*
2792 2797 	 * For immutable files, only the immutable bit and atime may be altered
2793 2798 */
2794 2799 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2795 2800 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2796 2801 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2797 2802 ZFS_EXIT(zfsvfs);
2798 2803 return (SET_ERROR(EPERM));
2799 2804 }
2800 2805
2801 2806 /*
2802 2807 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2803 2808 */
2804 2809
2805 2810 /*
2806 2811 	 * Verify the timestamps don't overflow 32 bits.
2807 2812 	 * ZFS can handle large timestamps, but 32-bit syscalls can't
2808 2813 	 * handle times beyond January 2038. This check should be removed
2809 2814 * once large timestamps are fully supported.
2810 2815 */
2811 2816 if (mask & (AT_ATIME | AT_MTIME)) {
2812 2817 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2813 2818 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2814 2819 ZFS_EXIT(zfsvfs);
2815 2820 return (SET_ERROR(EOVERFLOW));
2816 2821 }
2817 2822 }
2818 2823
2819 2824 top:
2820 2825 attrzp = NULL;
2821 2826 aclp = NULL;
2822 2827
2823 2828 /* Can this be moved to before the top label? */
2824 2829 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2825 2830 ZFS_EXIT(zfsvfs);
2826 2831 return (SET_ERROR(EROFS));
2827 2832 }
2828 2833
2829 2834 /*
2830 2835 * First validate permissions
2831 2836 */
2832 2837
2833 2838 if (mask & AT_SIZE) {
2834 2839 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2835 2840 if (err) {
2836 2841 ZFS_EXIT(zfsvfs);
2837 2842 return (err);
2838 2843 }
2839 2844 /*
2840 2845 * XXX - Note, we are not providing any open
2841 2846 * mode flags here (like FNDELAY), so we may
2842 2847 * block if there are locks present... this
2843 2848 * should be addressed in openat().
2844 2849 */
2845 2850 /* XXX - would it be OK to generate a log record here? */
2846 2851 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2847 2852 if (err) {
2848 2853 ZFS_EXIT(zfsvfs);
2849 2854 return (err);
2850 2855 }
2851 2856
2852 2857 if (vap->va_size == 0)
2853 2858 vnevent_truncate(ZTOV(zp), ct);
2854 2859 }
2855 2860
2856 2861 if (mask & (AT_ATIME|AT_MTIME) ||
2857 2862 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2858 2863 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2859 2864 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2860 2865 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2861 2866 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2862 2867 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2863 2868 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2864 2869 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2865 2870 skipaclchk, cr);
2866 2871 }
2867 2872
2868 2873 if (mask & (AT_UID|AT_GID)) {
2869 2874 int idmask = (mask & (AT_UID|AT_GID));
2870 2875 int take_owner;
2871 2876 int take_group;
2872 2877
2873 2878 /*
2874 2879 * NOTE: even if a new mode is being set,
2875 2880 * we may clear S_ISUID/S_ISGID bits.
2876 2881 */
2877 2882
2878 2883 if (!(mask & AT_MODE))
2879 2884 vap->va_mode = zp->z_mode;
2880 2885
2881 2886 /*
2882 2887 	 * Take ownership, or chgrp to a group we are a member of
2883 2888 */
2884 2889
2885 2890 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2886 2891 take_group = (mask & AT_GID) &&
2887 2892 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2888 2893
2889 2894 /*
2890 2895 * If both AT_UID and AT_GID are set then take_owner and
2891 2896 * take_group must both be set in order to allow taking
2892 2897 * ownership.
2893 2898 *
2894 2899 * Otherwise, send the check through secpolicy_vnode_setattr()
2895 2900 *
2896 2901 */
2897 2902
2898 2903 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2899 2904 ((idmask == AT_UID) && take_owner) ||
2900 2905 ((idmask == AT_GID) && take_group)) {
2901 2906 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2902 2907 skipaclchk, cr) == 0) {
2903 2908 /*
2904 2909 * Remove setuid/setgid for non-privileged users
2905 2910 */
2906 2911 secpolicy_setid_clear(vap, cr);
2907 2912 trim_mask = (mask & (AT_UID|AT_GID));
2908 2913 } else {
2909 2914 need_policy = TRUE;
2910 2915 }
2911 2916 } else {
2912 2917 need_policy = TRUE;
2913 2918 }
2914 2919 }
2915 2920
2916 2921 mutex_enter(&zp->z_lock);
2917 2922 oldva.va_mode = zp->z_mode;
2918 2923 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2919 2924 if (mask & AT_XVATTR) {
2920 2925 /*
2921 2926 * Update xvattr mask to include only those attributes
2922 2927 * that are actually changing.
2923 2928 *
2924 2929 	 * The bits will be restored prior to actually setting
2925 2930 * the attributes so the caller thinks they were set.
2926 2931 */
2927 2932 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2928 2933 if (xoap->xoa_appendonly !=
2929 2934 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2930 2935 need_policy = TRUE;
2931 2936 } else {
2932 2937 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2933 2938 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2934 2939 }
2935 2940 }
2936 2941
2937 2942 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2938 2943 if (xoap->xoa_nounlink !=
2939 2944 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2940 2945 need_policy = TRUE;
2941 2946 } else {
2942 2947 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2943 2948 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2944 2949 }
2945 2950 }
2946 2951
2947 2952 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2948 2953 if (xoap->xoa_immutable !=
2949 2954 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2950 2955 need_policy = TRUE;
2951 2956 } else {
2952 2957 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2953 2958 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2954 2959 }
2955 2960 }
2956 2961
2957 2962 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2958 2963 if (xoap->xoa_nodump !=
2959 2964 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2960 2965 need_policy = TRUE;
2961 2966 } else {
2962 2967 XVA_CLR_REQ(xvap, XAT_NODUMP);
2963 2968 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2964 2969 }
2965 2970 }
2966 2971
2967 2972 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2968 2973 if (xoap->xoa_av_modified !=
2969 2974 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2970 2975 need_policy = TRUE;
2971 2976 } else {
2972 2977 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2973 2978 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2974 2979 }
2975 2980 }
2976 2981
2977 2982 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2978 2983 if ((vp->v_type != VREG &&
2979 2984 xoap->xoa_av_quarantined) ||
2980 2985 xoap->xoa_av_quarantined !=
2981 2986 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2982 2987 need_policy = TRUE;
2983 2988 } else {
2984 2989 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2985 2990 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2986 2991 }
2987 2992 }
2988 2993
2989 2994 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2990 2995 mutex_exit(&zp->z_lock);
2991 2996 ZFS_EXIT(zfsvfs);
2992 2997 return (SET_ERROR(EPERM));
2993 2998 }
2994 2999
2995 3000 if (need_policy == FALSE &&
2996 3001 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2997 3002 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2998 3003 need_policy = TRUE;
2999 3004 }
3000 3005 }
3001 3006
3002 3007 mutex_exit(&zp->z_lock);
3003 3008
3004 3009 if (mask & AT_MODE) {
3005 3010 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3006 3011 err = secpolicy_setid_setsticky_clear(vp, vap,
3007 3012 &oldva, cr);
3008 3013 if (err) {
3009 3014 ZFS_EXIT(zfsvfs);
3010 3015 return (err);
3011 3016 }
3012 3017 trim_mask |= AT_MODE;
3013 3018 } else {
3014 3019 need_policy = TRUE;
3015 3020 }
3016 3021 }
3017 3022
3018 3023 if (need_policy) {
3019 3024 /*
3020 3025 	 * If trim_mask is set then take-ownership
3021 3026 	 * has been granted, or write_acl is present and the user
3022 3027 	 * has the ability to modify the mode. In that case remove
3023 3028 	 * UID|GID and/or MODE from the mask so that
3024 3029 	 * secpolicy_vnode_setattr() doesn't revoke it.
3025 3030 */
3026 3031
3027 3032 if (trim_mask) {
3028 3033 saved_mask = vap->va_mask;
3029 3034 vap->va_mask &= ~trim_mask;
3030 3035 }
3031 3036 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3032 3037 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3033 3038 if (err) {
3034 3039 ZFS_EXIT(zfsvfs);
3035 3040 return (err);
3036 3041 }
3037 3042
3038 3043 if (trim_mask)
3039 3044 vap->va_mask |= saved_mask;
3040 3045 }
3041 3046
3042 3047 /*
3043 3048 	 * secpolicy_vnode_setattr() or the take-ownership check may have
3044 3049 	 * changed va_mask.
3045 3050 */
3046 3051 mask = vap->va_mask;
3047 3052
3048 3053 if ((mask & (AT_UID | AT_GID))) {
3049 3054 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3050 3055 &xattr_obj, sizeof (xattr_obj));
3051 3056
3052 3057 if (err == 0 && xattr_obj) {
3053 3058 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3054 3059 if (err)
3055 3060 goto out2;
3056 3061 }
3057 3062 if (mask & AT_UID) {
3058 3063 new_uid = zfs_fuid_create(zfsvfs,
3059 3064 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3060 3065 if (new_uid != zp->z_uid &&
3061 3066 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3062 3067 if (attrzp)
3063 3068 VN_RELE(ZTOV(attrzp));
3064 3069 err = SET_ERROR(EDQUOT);
3065 3070 goto out2;
3066 3071 }
3067 3072 }
3068 3073
3069 3074 if (mask & AT_GID) {
3070 3075 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3071 3076 cr, ZFS_GROUP, &fuidp);
3072 3077 if (new_gid != zp->z_gid &&
3073 3078 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3074 3079 if (attrzp)
3075 3080 VN_RELE(ZTOV(attrzp));
3076 3081 err = SET_ERROR(EDQUOT);
3077 3082 goto out2;
3078 3083 }
3079 3084 }
3080 3085 }
3081 3086 tx = dmu_tx_create(zfsvfs->z_os);
3082 3087
3083 3088 if (mask & AT_MODE) {
3084 3089 uint64_t pmode = zp->z_mode;
3085 3090 uint64_t acl_obj;
3086 3091 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3087 3092
3088 3093 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3089 3094 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3090 3095 err = SET_ERROR(EPERM);
3091 3096 goto out;
3092 3097 }
3093 3098
3094 3099 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3095 3100 goto out;
3096 3101
3097 3102 mutex_enter(&zp->z_lock);
3098 3103 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3099 3104 /*
3100 3105 * Are we upgrading ACL from old V0 format
3101 3106 * to V1 format?
3102 3107 */
3103 3108 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3104 3109 zfs_znode_acl_version(zp) ==
3105 3110 ZFS_ACL_VERSION_INITIAL) {
3106 3111 dmu_tx_hold_free(tx, acl_obj, 0,
3107 3112 DMU_OBJECT_END);
3108 3113 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3109 3114 0, aclp->z_acl_bytes);
3110 3115 } else {
3111 3116 dmu_tx_hold_write(tx, acl_obj, 0,
3112 3117 aclp->z_acl_bytes);
3113 3118 }
3114 3119 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3115 3120 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3116 3121 0, aclp->z_acl_bytes);
3117 3122 }
3118 3123 mutex_exit(&zp->z_lock);
3119 3124 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3120 3125 } else {
3121 3126 if ((mask & AT_XVATTR) &&
3122 3127 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3123 3128 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3124 3129 else
3125 3130 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3126 3131 }
3127 3132
3128 3133 if (attrzp) {
3129 3134 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3130 3135 }
3131 3136
3132 3137 fuid_dirtied = zfsvfs->z_fuid_dirty;
3133 3138 if (fuid_dirtied)
3134 3139 zfs_fuid_txhold(zfsvfs, tx);
3135 3140
3136 3141 zfs_sa_upgrade_txholds(tx, zp);
3137 3142
3138 3143 err = dmu_tx_assign(tx, TXG_WAIT);
3139 3144 if (err)
3140 3145 goto out;
3141 3146
3142 3147 count = 0;
3143 3148 /*
3144 3149 * Set each attribute requested.
3145 3150 * We group settings according to the locks they need to acquire.
3146 3151 *
3147 3152 * Note: you cannot set ctime directly, although it will be
3148 3153 * updated as a side-effect of calling this function.
3149 3154 */
3150 3155
3151 3156
3152 3157 if (mask & (AT_UID|AT_GID|AT_MODE))
3153 3158 mutex_enter(&zp->z_acl_lock);
3154 3159 mutex_enter(&zp->z_lock);
3155 3160
3156 3161 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3157 3162 &zp->z_pflags, sizeof (zp->z_pflags));
3158 3163
3159 3164 if (attrzp) {
3160 3165 if (mask & (AT_UID|AT_GID|AT_MODE))
3161 3166 mutex_enter(&attrzp->z_acl_lock);
3162 3167 mutex_enter(&attrzp->z_lock);
3163 3168 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3164 3169 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3165 3170 sizeof (attrzp->z_pflags));
3166 3171 }
3167 3172
3168 3173 if (mask & (AT_UID|AT_GID)) {
3169 3174
3170 3175 if (mask & AT_UID) {
3171 3176 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3172 3177 &new_uid, sizeof (new_uid));
3173 3178 zp->z_uid = new_uid;
3174 3179 if (attrzp) {
3175 3180 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3176 3181 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3177 3182 sizeof (new_uid));
3178 3183 attrzp->z_uid = new_uid;
3179 3184 }
3180 3185 }
3181 3186
3182 3187 if (mask & AT_GID) {
3183 3188 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3184 3189 NULL, &new_gid, sizeof (new_gid));
3185 3190 zp->z_gid = new_gid;
3186 3191 if (attrzp) {
3187 3192 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3188 3193 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3189 3194 sizeof (new_gid));
3190 3195 attrzp->z_gid = new_gid;
3191 3196 }
3192 3197 }
3193 3198 if (!(mask & AT_MODE)) {
3194 3199 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3195 3200 NULL, &new_mode, sizeof (new_mode));
3196 3201 new_mode = zp->z_mode;
3197 3202 }
3198 3203 err = zfs_acl_chown_setattr(zp);
3199 3204 ASSERT(err == 0);
3200 3205 if (attrzp) {
3201 3206 err = zfs_acl_chown_setattr(attrzp);
3202 3207 ASSERT(err == 0);
3203 3208 }
3204 3209 }
3205 3210
3206 3211 if (mask & AT_MODE) {
3207 3212 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3208 3213 &new_mode, sizeof (new_mode));
3209 3214 zp->z_mode = new_mode;
3210 3215 ASSERT3U((uintptr_t)aclp, !=, NULL);
3211 3216 err = zfs_aclset_common(zp, aclp, cr, tx);
3212 3217 ASSERT0(err);
3213 3218 if (zp->z_acl_cached)
3214 3219 zfs_acl_free(zp->z_acl_cached);
3215 3220 zp->z_acl_cached = aclp;
3216 3221 aclp = NULL;
3217 3222 }
3218 3223
3219 3224
3220 3225 if (mask & AT_ATIME) {
3221 3226 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3222 3227 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3223 3228 &zp->z_atime, sizeof (zp->z_atime));
3224 3229 }
3225 3230
3226 3231 if (mask & AT_MTIME) {
3227 3232 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3228 3233 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3229 3234 mtime, sizeof (mtime));
3230 3235 }
3231 3236
3232 3237 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3233 3238 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3234 3239 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3235 3240 NULL, mtime, sizeof (mtime));
3236 3241 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3237 3242 &ctime, sizeof (ctime));
3238 3243 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3239 3244 B_TRUE);
3240 3245 } else if (mask != 0) {
3241 3246 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3242 3247 &ctime, sizeof (ctime));
3243 3248 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3244 3249 B_TRUE);
3245 3250 if (attrzp) {
3246 3251 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3247 3252 SA_ZPL_CTIME(zfsvfs), NULL,
3248 3253 &ctime, sizeof (ctime));
3249 3254 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3250 3255 mtime, ctime, B_TRUE);
3251 3256 }
3252 3257 }
3253 3258 /*
3254 3259 	 * Do this after setting the timestamps to prevent a timestamp
3255 3260 	 * update from toggling the bit.
3256 3261 */
3257 3262
3258 3263 if (xoap && (mask & AT_XVATTR)) {
3259 3264
3260 3265 /*
3261 3266 	 * Restore the masks trimmed off above
3262 3267 	 * so that the return masks can be set for the caller.
3263 3268 */
3264 3269
3265 3270 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3266 3271 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3267 3272 }
3268 3273 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3269 3274 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3270 3275 }
3271 3276 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3272 3277 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3273 3278 }
3274 3279 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3275 3280 XVA_SET_REQ(xvap, XAT_NODUMP);
3276 3281 }
3277 3282 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3278 3283 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3279 3284 }
3280 3285 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3281 3286 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3282 3287 }
3283 3288
3284 3289 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3285 3290 ASSERT(vp->v_type == VREG);
3286 3291
3287 3292 zfs_xvattr_set(zp, xvap, tx);
3288 3293 }
3289 3294
3290 3295 if (fuid_dirtied)
3291 3296 zfs_fuid_sync(zfsvfs, tx);
3292 3297
3293 3298 if (mask != 0)
3294 3299 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3295 3300
3296 3301 mutex_exit(&zp->z_lock);
3297 3302 if (mask & (AT_UID|AT_GID|AT_MODE))
3298 3303 mutex_exit(&zp->z_acl_lock);
3299 3304
3300 3305 if (attrzp) {
3301 3306 if (mask & (AT_UID|AT_GID|AT_MODE))
3302 3307 mutex_exit(&attrzp->z_acl_lock);
3303 3308 mutex_exit(&attrzp->z_lock);
3304 3309 }
3305 3310 out:
3306 3311 if (err == 0 && attrzp) {
3307 3312 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3308 3313 xattr_count, tx);
3309 3314 ASSERT(err2 == 0);
3310 3315 }
3311 3316
3312 3317 if (attrzp)
3313 3318 VN_RELE(ZTOV(attrzp));
3314 3319
3315 3320 if (aclp)
3316 3321 zfs_acl_free(aclp);
3317 3322
3318 3323 if (fuidp) {
3319 3324 zfs_fuid_info_free(fuidp);
3320 3325 fuidp = NULL;
3321 3326 }
3322 3327
3323 3328 if (err) {
3324 3329 dmu_tx_abort(tx);
3325 3330 if (err == ERESTART)
3326 3331 goto top;
3327 3332 } else {
3328 3333 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3329 3334 dmu_tx_commit(tx);
3330 3335 }
3331 3336
3332 3337 out2:
3333 3338 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3334 3339 zil_commit(zilog, 0);
3335 3340
3336 3341 ZFS_EXIT(zfsvfs);
3337 3342 return (err);
3338 3343 }
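
/*
 * Review sketch, not part of this change: a minimal ownership change
 * through the entry point above. Only the fields selected by va_mask
 * are consulted; everything else in the vattr is ignored. The
 * function name is hypothetical.
 */
static int
zfs_chown_demo(vnode_t *vp, uid_t uid, gid_t gid, cred_t *cr)
{
	vattr_t va;

	bzero(&va, sizeof (va));
	va.va_mask = AT_UID | AT_GID;
	va.va_uid = uid;
	va.va_gid = gid;

	return (VOP_SETATTR(vp, &va, 0, cr, NULL));
}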
3339 3344
3340 3345 typedef struct zfs_zlock {
3341 3346 krwlock_t *zl_rwlock; /* lock we acquired */
3342 3347 znode_t *zl_znode; /* znode we held */
3343 3348 struct zfs_zlock *zl_next; /* next in list */
3344 3349 } zfs_zlock_t;
3345 3350
3346 3351 /*
3347 3352 * Drop locks and release vnodes that were held by zfs_rename_lock().
3348 3353 */
3349 3354 static void
3350 3355 zfs_rename_unlock(zfs_zlock_t **zlpp)
3351 3356 {
3352 3357 zfs_zlock_t *zl;
3353 3358
3354 3359 while ((zl = *zlpp) != NULL) {
3355 3360 if (zl->zl_znode != NULL)
3356 3361 VN_RELE(ZTOV(zl->zl_znode));
3357 3362 rw_exit(zl->zl_rwlock);
3358 3363 *zlpp = zl->zl_next;
3359 3364 kmem_free(zl, sizeof (*zl));
3360 3365 }
3361 3366 }
3362 3367
3363 3368 /*
3364 3369 * Search back through the directory tree, using the ".." entries.
3365 3370 * Lock each directory in the chain to prevent concurrent renames.
3366 3371 * Fail any attempt to move a directory into one of its own descendants.
3367 3372 * XXX - z_parent_lock can overlap with map or grow locks
3368 3373 */
3369 3374 static int
3370 3375 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3371 3376 {
3372 3377 zfs_zlock_t *zl;
3373 3378 znode_t *zp = tdzp;
3374 3379 uint64_t rootid = zp->z_zfsvfs->z_root;
3375 3380 uint64_t oidp = zp->z_id;
3376 3381 krwlock_t *rwlp = &szp->z_parent_lock;
3377 3382 krw_t rw = RW_WRITER;
3378 3383
3379 3384 /*
3380 3385 * First pass write-locks szp and compares to zp->z_id.
3381 3386 * Later passes read-lock zp and compare to zp->z_parent.
3382 3387 */
3383 3388 do {
3384 3389 if (!rw_tryenter(rwlp, rw)) {
3385 3390 /*
3386 3391 * Another thread is renaming in this path.
3387 3392 * Note that if we are a WRITER, we don't have any
3388 3393 * parent_locks held yet.
3389 3394 */
3390 3395 if (rw == RW_READER && zp->z_id > szp->z_id) {
3391 3396 /*
3392 3397 * Drop our locks and restart
3393 3398 */
3394 3399 zfs_rename_unlock(&zl);
3395 3400 *zlpp = NULL;
3396 3401 zp = tdzp;
3397 3402 oidp = zp->z_id;
3398 3403 rwlp = &szp->z_parent_lock;
3399 3404 rw = RW_WRITER;
3400 3405 continue;
3401 3406 } else {
3402 3407 /*
3403 3408 	 * Wait for the other thread to drop its locks
3404 3409 */
3405 3410 rw_enter(rwlp, rw);
3406 3411 }
3407 3412 }
3408 3413
3409 3414 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3410 3415 zl->zl_rwlock = rwlp;
3411 3416 zl->zl_znode = NULL;
3412 3417 zl->zl_next = *zlpp;
3413 3418 *zlpp = zl;
3414 3419
3415 3420 if (oidp == szp->z_id) /* We're a descendant of szp */
3416 3421 return (SET_ERROR(EINVAL));
3417 3422
3418 3423 if (oidp == rootid) /* We've hit the top */
3419 3424 return (0);
3420 3425
3421 3426 if (rw == RW_READER) { /* i.e. not the first pass */
3422 3427 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3423 3428 if (error)
3424 3429 return (error);
3425 3430 zl->zl_znode = zp;
3426 3431 }
3427 3432 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3428 3433 &oidp, sizeof (oidp));
3429 3434 rwlp = &zp->z_parent_lock;
3430 3435 rw = RW_READER;
3431 3436
3432 3437 } while (zp->z_id != sdzp->z_id);
3433 3438
3434 3439 return (0);
3435 3440 }
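
/*
 * Worked example for the walk above (review note, not part of this
 * change): consider "mv /a /a/b/x", so szp is /a, sdzp is /, and tdzp
 * is /a/b. The first pass write-locks /a's z_parent_lock and finds
 * that tdzp's id is neither szp's id nor the root. Following the ".."
 * entry of /a/b yields /a, so on the second pass oidp == szp->z_id
 * and the function returns EINVAL before the directory can become its
 * own descendant.
 */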
3436 3441
3437 3442 /*
3438 3443 * Move an entry from the provided source directory to the target
3439 3444 * directory. Change the entry name as indicated.
3440 3445 *
3441 3446 * IN: sdvp - Source directory containing the "old entry".
3442 3447 * snm - Old entry name.
3443 3448 * tdvp - Target directory to contain the "new entry".
3444 3449 * tnm - New entry name.
3445 3450 * cr - credentials of caller.
3446 3451 * ct - caller context
3447 3452 * flags - case flags
3448 3453 *
3449 3454 * RETURN: 0 on success, error code on failure.
3450 3455 *
3451 3456 * Timestamps:
3452 3457 * sdvp,tdvp - ctime|mtime updated
3453 3458 */
3454 3459 /*ARGSUSED*/
3455 3460 static int
3456 3461 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3457 3462 caller_context_t *ct, int flags)
3458 3463 {
3459 3464 znode_t *tdzp, *szp, *tzp;
3460 3465 znode_t *sdzp = VTOZ(sdvp);
3461 3466 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
3462 3467 zilog_t *zilog;
3463 3468 vnode_t *realvp;
3464 3469 zfs_dirlock_t *sdl, *tdl;
3465 3470 dmu_tx_t *tx;
3466 3471 zfs_zlock_t *zl;
3467 3472 int cmp, serr, terr;
3468 3473 int error = 0, rm_err = 0;
3469 3474 int zflg = 0;
3470 3475 boolean_t waited = B_FALSE;
3471 3476
3472 3477 ZFS_ENTER(zfsvfs);
3473 3478 ZFS_VERIFY_ZP(sdzp);
3474 3479 zilog = zfsvfs->z_log;
3475 3480
3476 3481 /*
3477 3482 * Make sure we have the real vp for the target directory.
3478 3483 */
3479 3484 if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3480 3485 tdvp = realvp;
3481 3486
3482 3487 tdzp = VTOZ(tdvp);
3483 3488 ZFS_VERIFY_ZP(tdzp);
3484 3489
3485 3490 /*
3486 3491 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3487 3492 * ctldir appear to have the same v_vfsp.
3488 3493 */
3489 3494 if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3490 3495 ZFS_EXIT(zfsvfs);
3491 3496 return (SET_ERROR(EXDEV));
3492 3497 }
3493 3498
3494 3499 if (zfsvfs->z_utf8 && u8_validate(tnm,
3495 3500 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3496 3501 ZFS_EXIT(zfsvfs);
3497 3502 return (SET_ERROR(EILSEQ));
3498 3503 }
3499 3504
3500 3505 if (flags & FIGNORECASE)
3501 3506 zflg |= ZCILOOK;
3502 3507
3503 3508 top:
3504 3509 szp = NULL;
3505 3510 tzp = NULL;
3506 3511 zl = NULL;
3507 3512
3508 3513 /*
3509 3514 * This is to prevent the creation of links into attribute space
3510 3515 	 * by renaming a linked file into/out of an attribute directory.
3511 3516 * See the comment in zfs_link() for why this is considered bad.
3512 3517 */
3513 3518 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3514 3519 ZFS_EXIT(zfsvfs);
3515 3520 return (SET_ERROR(EINVAL));
3516 3521 }
3517 3522
3518 3523 /*
3519 3524 * Lock source and target directory entries. To prevent deadlock,
3520 3525 * a lock ordering must be defined. We lock the directory with
3521 3526 * the smallest object id first, or if it's a tie, the one with
3522 3527 * the lexically first name.
3523 3528 */
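	/*
	 * Worked example (hypothetical ids): with sdzp->z_id == 812 and
	 * tdzp->z_id == 97, cmp below is 1 and the target entry is locked
	 * first; a thread renaming in the opposite direction between the
	 * same two directories computes cmp == -1 and also locks that
	 * entry first. Because every thread agrees on the order, none can
	 * hold one directory lock while waiting on the other.
	 */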
3524 3529 if (sdzp->z_id < tdzp->z_id) {
3525 3530 cmp = -1;
3526 3531 } else if (sdzp->z_id > tdzp->z_id) {
3527 3532 cmp = 1;
3528 3533 } else {
3529 3534 /*
3530 3535 * First compare the two name arguments without
3531 3536 * considering any case folding.
3532 3537 */
3533 3538 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3534 3539
3535 3540 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3536 3541 ASSERT(error == 0 || !zfsvfs->z_utf8);
3537 3542 if (cmp == 0) {
3538 3543 /*
3539 3544 * POSIX: "If the old argument and the new argument
3540 3545 * both refer to links to the same existing file,
3541 3546 * the rename() function shall return successfully
3542 3547 * and perform no other action."
3543 3548 */
3544 3549 ZFS_EXIT(zfsvfs);
3545 3550 return (0);
3546 3551 }
3547 3552 /*
3548 3553 * If the file system is case-folding, then we may
3549 3554 * have some more checking to do. A case-folding file
3550 3555 * system is either supporting mixed case sensitivity
3551 3556 * access or is completely case-insensitive. Note
3552 3557 * that the file system is always case preserving.
3553 3558 *
3554 3559 * In mixed sensitivity mode case sensitive behavior
3555 3560 * is the default. FIGNORECASE must be used to
3556 3561 * explicitly request case insensitive behavior.
3557 3562 *
3558 3563 * If the source and target names provided differ only
3559 3564 * by case (e.g., a request to rename 'tim' to 'Tim'),
3560 3565 * we will treat this as a special case in the
3561 3566 * case-insensitive mode: as long as the source name
3562 3567 * is an exact match, we will allow this to proceed as
3563 3568 * a name-change request.
3564 3569 */
3565 3570 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3566 3571 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3567 3572 flags & FIGNORECASE)) &&
3568 3573 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3569 3574 &error) == 0) {
3570 3575 /*
3571 3576 * case preserving rename request, require exact
3572 3577 * name matches
3573 3578 */
3574 3579 zflg |= ZCIEXACT;
3575 3580 zflg &= ~ZCILOOK;
3576 3581 }
3577 3582 }
3578 3583
3579 3584 /*
3580 3585 * If the source and destination directories are the same, we should
3581 3586 * grab the z_name_lock of that directory only once.
3582 3587 */
3583 3588 if (sdzp == tdzp) {
3584 3589 zflg |= ZHAVELOCK;
3585 3590 rw_enter(&sdzp->z_name_lock, RW_READER);
3586 3591 }
3587 3592
3588 3593 if (cmp < 0) {
3589 3594 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3590 3595 ZEXISTS | zflg, NULL, NULL);
3591 3596 terr = zfs_dirent_lock(&tdl,
3592 3597 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3593 3598 } else {
3594 3599 terr = zfs_dirent_lock(&tdl,
3595 3600 tdzp, tnm, &tzp, zflg, NULL, NULL);
3596 3601 serr = zfs_dirent_lock(&sdl,
3597 3602 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3598 3603 NULL, NULL);
3599 3604 }
3600 3605
3601 3606 if (serr) {
3602 3607 /*
3603 3608 * Source entry invalid or not there.
3604 3609 */
3605 3610 if (!terr) {
3606 3611 zfs_dirent_unlock(tdl);
3607 3612 if (tzp)
3608 3613 VN_RELE(ZTOV(tzp));
3609 3614 }
3610 3615
3611 3616 if (sdzp == tdzp)
3612 3617 rw_exit(&sdzp->z_name_lock);
3613 3618
3614 3619 if (strcmp(snm, "..") == 0)
3615 3620 serr = SET_ERROR(EINVAL);
3616 3621 ZFS_EXIT(zfsvfs);
3617 3622 return (serr);
3618 3623 }
3619 3624 if (terr) {
3620 3625 zfs_dirent_unlock(sdl);
3621 3626 VN_RELE(ZTOV(szp));
3622 3627
3623 3628 if (sdzp == tdzp)
3624 3629 rw_exit(&sdzp->z_name_lock);
3625 3630
3626 3631 if (strcmp(tnm, "..") == 0)
3627 3632 terr = SET_ERROR(EINVAL);
3628 3633 ZFS_EXIT(zfsvfs);
3629 3634 return (terr);
3630 3635 }
3631 3636
3632 3637 /*
3633 3638 * Must have write access at the source to remove the old entry
3634 3639 * and write access at the target to create the new entry.
3635 3640 * Note that if target and source are the same, this can be
3636 3641 * done in a single check.
3637 3642 */
3638 3643
3639 3644 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3640 3645 goto out;
3641 3646
3642 3647 if (ZTOV(szp)->v_type == VDIR) {
3643 3648 /*
3644 3649 * Check to make sure rename is valid.
3645 3650 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3646 3651 */
3647 3652 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3648 3653 goto out;
3649 3654 }
3650 3655
3651 3656 /*
3652 3657 * Does target exist?
3653 3658 */
3654 3659 if (tzp) {
3655 3660 /*
3656 3661 * Source and target must be the same type.
3657 3662 */
3658 3663 if (ZTOV(szp)->v_type == VDIR) {
3659 3664 if (ZTOV(tzp)->v_type != VDIR) {
3660 3665 error = SET_ERROR(ENOTDIR);
3661 3666 goto out;
3662 3667 }
3663 3668 } else {
3664 3669 if (ZTOV(tzp)->v_type == VDIR) {
3665 3670 error = SET_ERROR(EISDIR);
3666 3671 goto out;
3667 3672 }
3668 3673 }
3669 3674 /*
3670 3675 * POSIX dictates that when the source and target
3671 3676 * entries refer to the same file object, rename
3672 3677 * must do nothing and exit without error.
3673 3678 */
3674 3679 if (szp->z_id == tzp->z_id) {
3675 3680 error = 0;
3676 3681 goto out;
3677 3682 }
3678 3683 }
3679 3684
3680 3685 vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct);
3681 3686 if (tzp)
3682 3687 vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3683 3688
3684 3689 /*
3685 3690 * notify the target directory if it is not the same
3686 3691 	 * as the source directory.
3687 3692 */
3688 3693 if (tdvp != sdvp) {
3689 3694 vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
3690 3695 }
3691 3696
3692 3697 tx = dmu_tx_create(zfsvfs->z_os);
3693 3698 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3694 3699 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3695 3700 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3696 3701 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3697 3702 if (sdzp != tdzp) {
3698 3703 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3699 3704 zfs_sa_upgrade_txholds(tx, tdzp);
3700 3705 }
3701 3706 if (tzp) {
3702 3707 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3703 3708 zfs_sa_upgrade_txholds(tx, tzp);
3704 3709 }
3705 3710
3706 3711 zfs_sa_upgrade_txholds(tx, szp);
3707 3712 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3708 3713 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3709 3714 if (error) {
3710 3715 if (zl != NULL)
3711 3716 zfs_rename_unlock(&zl);
3712 3717 zfs_dirent_unlock(sdl);
3713 3718 zfs_dirent_unlock(tdl);
3714 3719
3715 3720 if (sdzp == tdzp)
3716 3721 rw_exit(&sdzp->z_name_lock);
3717 3722
3718 3723 VN_RELE(ZTOV(szp));
3719 3724 if (tzp)
3720 3725 VN_RELE(ZTOV(tzp));
3721 3726 if (error == ERESTART) {
3722 3727 waited = B_TRUE;
3723 3728 dmu_tx_wait(tx);
3724 3729 dmu_tx_abort(tx);
3725 3730 goto top;
3726 3731 }
3727 3732 dmu_tx_abort(tx);
3728 3733 ZFS_EXIT(zfsvfs);
3729 3734 return (error);
3730 3735 }
3731 3736
3732 3737 if (tzp) /* Attempt to remove the existing target */
3733 3738 error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3734 3739
3735 3740 if (error == 0) {
3736 3741 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3737 3742 if (error == 0) {
3738 3743 szp->z_pflags |= ZFS_AV_MODIFIED;
3739 3744
3740 3745 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3741 3746 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3742 3747 ASSERT0(error);
3743 3748
3744 3749 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3745 3750 if (error == 0) {
3746 3751 zfs_log_rename(zilog, tx, TX_RENAME |
3747 3752 (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3748 3753 sdl->dl_name, tdzp, tdl->dl_name, szp);
3749 3754
3750 3755 /*
3751 3756 * Update path information for the target vnode
3752 3757 */
3753 3758 vn_renamepath(tdvp, ZTOV(szp), tnm,
3754 3759 strlen(tnm));
3755 3760 } else {
3756 3761 /*
3757 3762 * At this point, we have successfully created
3758 3763 * the target name, but have failed to remove
3759 3764 * the source name. Since the create was done
3760 3765 * with the ZRENAMING flag, there are
3761 3766 * complications; for one, the link count is
3762 3767 * wrong. The easiest way to deal with this
3763 3768 * is to remove the newly created target, and
3764 3769 * return the original error. This must
3765 3770 * succeed; fortunately, it is very unlikely to
3766 3771 * fail, since we just created it.
3767 3772 */
3768 3773 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3769 3774 ZRENAMING, NULL), ==, 0);
3770 3775 }
3771 3776 }
3772 3777 }
3773 3778
3774 3779 dmu_tx_commit(tx);
3775 3780
3776 3781 if (tzp && rm_err == 0)
3777 3782 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3778 3783
3779 3784 if (error == 0) {
3780 3785 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3781 3786 /* notify the target dir if it is not the same as source dir */
3782 3787 if (tdvp != sdvp)
3783 3788 vnevent_rename_dest_dir(tdvp, ct);
3784 3789 }
3785 3790 out:
3786 3791 if (zl != NULL)
3787 3792 zfs_rename_unlock(&zl);
3788 3793
3789 3794 zfs_dirent_unlock(sdl);
3790 3795 zfs_dirent_unlock(tdl);
3791 3796
3792 3797 if (sdzp == tdzp)
3793 3798 rw_exit(&sdzp->z_name_lock);
3794 3799
3795 3800
3796 3801 VN_RELE(ZTOV(szp));
3797 3802 if (tzp)
3798 3803 VN_RELE(ZTOV(tzp));
3799 3804
3800 3805 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3801 3806 zil_commit(zilog, 0);
3802 3807
3803 3808 ZFS_EXIT(zfsvfs);
3804 3809 return (error);
3805 3810 }
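The dmu_tx_assign() dance above is the idiom shared by every write path in this file: the first attempt passes TXG_NOWAIT so no directory locks are held while the open transaction group is throttled; on ERESTART every lock is dropped, dmu_tx_wait() sleeps until the pressure clears, and the retry from top: adds TXG_NOTHROTTLE so a transaction that has already waited once is not throttled a second time. A compilable reduction of that control flow (tx_assign() is a hypothetical stand-in for dmu_tx_assign(), and the ERESTART value here is illustrative):

        #include <stdio.h>

        #define ERESTART        91      /* illustrative stand-in value */

        static int attempts;

        /* Hypothetical stand-in: throttle the first, throttleable attempt. */
        static int
        tx_assign(int nothrottle)
        {
                if (attempts++ == 0 && !nothrottle)
                        return (ERESTART);
                return (0);
        }

        int
        main(void)
        {
                int waited = 0;
                int error;

        top:
                /* ... acquire directory locks, declare tx holds ... */
                error = tx_assign(waited);
                if (error) {
                        /* ... drop every lock before sleeping ... */
                        if (error == ERESTART) {
                                waited = 1;
                                /* dmu_tx_wait(tx); dmu_tx_abort(tx); */
                                goto top;
                        }
                        /* dmu_tx_abort(tx); */
                        return (error);
                }
                /* ... modify the objects, dmu_tx_commit(tx) ... */
                printf("assigned after %d attempt(s)\n", attempts);
                return (0);
        }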
3806 3811
3807 3812 /*
3808 3813 * Insert the indicated symbolic reference entry into the directory.
3809 3814 *
3810 3815 * IN: dvp - Directory to contain new symbolic link.
3811 3816 * link - Name for new symlink entry.
3812 3817 * vap - Attributes of new entry.
3813 3818 * cr - credentials of caller.
3814 3819 * ct - caller context
3815 3820 * flags - case flags
3816 3821 *
3817 3822 * RETURN: 0 on success, error code on failure.
3818 3823 *
3819 3824 * Timestamps:
3820 3825 * dvp - ctime|mtime updated
3821 3826 */
3822 3827 /*ARGSUSED*/
3823 3828 static int
3824 3829 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3825 3830 caller_context_t *ct, int flags)
3826 3831 {
3827 3832 znode_t *zp, *dzp = VTOZ(dvp);
3828 3833 zfs_dirlock_t *dl;
3829 3834 dmu_tx_t *tx;
3830 3835 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3831 3836 zilog_t *zilog;
3832 3837 uint64_t len = strlen(link);
3833 3838 int error;
3834 3839 int zflg = ZNEW;
3835 3840 zfs_acl_ids_t acl_ids;
3836 3841 boolean_t fuid_dirtied;
3837 3842 uint64_t txtype = TX_SYMLINK;
3838 3843 boolean_t waited = B_FALSE;
3839 3844
3840 3845 ASSERT(vap->va_type == VLNK);
3841 3846
3842 3847 ZFS_ENTER(zfsvfs);
3843 3848 ZFS_VERIFY_ZP(dzp);
3844 3849 zilog = zfsvfs->z_log;
3845 3850
3846 3851 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3847 3852 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3848 3853 ZFS_EXIT(zfsvfs);
3849 3854 return (SET_ERROR(EILSEQ));
3850 3855 }
3851 3856 if (flags & FIGNORECASE)
3852 3857 zflg |= ZCILOOK;
3853 3858
3854 3859 if (len > MAXPATHLEN) {
3855 3860 ZFS_EXIT(zfsvfs);
3856 3861 return (SET_ERROR(ENAMETOOLONG));
3857 3862 }
3858 3863
3859 3864 if ((error = zfs_acl_ids_create(dzp, 0,
3860 3865 vap, cr, NULL, &acl_ids)) != 0) {
3861 3866 ZFS_EXIT(zfsvfs);
3862 3867 return (error);
3863 3868 }
3864 3869 top:
3865 3870 /*
3866 3871 * Attempt to lock directory; fail if entry already exists.
3867 3872 */
3868 3873 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3869 3874 if (error) {
3870 3875 zfs_acl_ids_free(&acl_ids);
3871 3876 ZFS_EXIT(zfsvfs);
3872 3877 return (error);
3873 3878 }
3874 3879
3875 3880 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3876 3881 zfs_acl_ids_free(&acl_ids);
3877 3882 zfs_dirent_unlock(dl);
3878 3883 ZFS_EXIT(zfsvfs);
3879 3884 return (error);
3880 3885 }
3881 3886
3882 3887 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3883 3888 zfs_acl_ids_free(&acl_ids);
3884 3889 zfs_dirent_unlock(dl);
3885 3890 ZFS_EXIT(zfsvfs);
3886 3891 return (SET_ERROR(EDQUOT));
3887 3892 }
3888 3893 tx = dmu_tx_create(zfsvfs->z_os);
3889 3894 fuid_dirtied = zfsvfs->z_fuid_dirty;
3890 3895 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3891 3896 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3892 3897 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3893 3898 ZFS_SA_BASE_ATTR_SIZE + len);
3894 3899 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3895 3900 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3896 3901 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3897 3902 acl_ids.z_aclp->z_acl_bytes);
3898 3903 }
3899 3904 if (fuid_dirtied)
3900 3905 zfs_fuid_txhold(zfsvfs, tx);
3901 3906 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3902 3907 if (error) {
3903 3908 zfs_dirent_unlock(dl);
3904 3909 if (error == ERESTART) {
3905 3910 waited = B_TRUE;
3906 3911 dmu_tx_wait(tx);
3907 3912 dmu_tx_abort(tx);
3908 3913 goto top;
3909 3914 }
3910 3915 zfs_acl_ids_free(&acl_ids);
3911 3916 dmu_tx_abort(tx);
3912 3917 ZFS_EXIT(zfsvfs);
3913 3918 return (error);
3914 3919 }
3915 3920
3916 3921 /*
3917 3922 * Create a new object for the symlink.
3918 3923 	 * for version 4 ZPL datasets the symlink will be an SA attribute
3919 3924 */
3920 3925 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3921 3926
3922 3927 if (fuid_dirtied)
3923 3928 zfs_fuid_sync(zfsvfs, tx);
3924 3929
3925 3930 mutex_enter(&zp->z_lock);
3926 3931 if (zp->z_is_sa)
3927 3932 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3928 3933 link, len, tx);
3929 3934 else
3930 3935 zfs_sa_symlink(zp, link, len, tx);
3931 3936 mutex_exit(&zp->z_lock);
3932 3937
3933 3938 zp->z_size = len;
3934 3939 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3935 3940 &zp->z_size, sizeof (zp->z_size), tx);
3936 3941 /*
3937 3942 * Insert the new object into the directory.
3938 3943 */
3939 3944 (void) zfs_link_create(dl, zp, tx, ZNEW);
3940 3945
3941 3946 if (flags & FIGNORECASE)
3942 3947 txtype |= TX_CI;
3943 3948 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3944 3949
3945 3950 zfs_acl_ids_free(&acl_ids);
3946 3951
3947 3952 dmu_tx_commit(tx);
3948 3953
3949 3954 zfs_dirent_unlock(dl);
3950 3955
3951 3956 VN_RELE(ZTOV(zp));
3952 3957
3953 3958 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3954 3959 zil_commit(zilog, 0);
3955 3960
3956 3961 ZFS_EXIT(zfsvfs);
3957 3962 return (error);
3958 3963 }
3959 3964
3960 3965 /*
3961 3966 * Return, in the buffer contained in the provided uio structure,
3962 3967 * the symbolic path referred to by vp.
3963 3968 *
3964 3969 * IN: vp - vnode of symbolic link.
3965 3970 * uio - structure to contain the link path.
3966 3971 * cr - credentials of caller.
3967 3972 * ct - caller context
3968 3973 *
3969 3974 * OUT: uio - structure containing the link path.
3970 3975 *
3971 3976 * RETURN: 0 on success, error code on failure.
3972 3977 *
3973 3978 * Timestamps:
3974 3979 * vp - atime updated
3975 3980 */
3976 3981 /* ARGSUSED */
3977 3982 static int
3978 3983 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3979 3984 {
3980 3985 znode_t *zp = VTOZ(vp);
3981 3986 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3982 3987 int error;
3983 3988
3984 3989 ZFS_ENTER(zfsvfs);
3985 3990 ZFS_VERIFY_ZP(zp);
3986 3991
3987 3992 mutex_enter(&zp->z_lock);
3988 3993 if (zp->z_is_sa)
3989 3994 error = sa_lookup_uio(zp->z_sa_hdl,
3990 3995 SA_ZPL_SYMLINK(zfsvfs), uio);
3991 3996 else
3992 3997 error = zfs_sa_readlink(zp, uio);
3993 3998 mutex_exit(&zp->z_lock);
3994 3999
3995 4000 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3996 4001
3997 4002 ZFS_EXIT(zfsvfs);
3998 4003 return (error);
3999 4004 }
4000 4005
4001 4006 /*
4002 4007 * Insert a new entry into directory tdvp referencing svp.
4003 4008 *
4004 4009 * IN: tdvp - Directory to contain new entry.
4005 4010 * svp - vnode of new entry.
4006 4011 * name - name of new entry.
4007 4012 * cr - credentials of caller.
4008 4013 * ct - caller context
4009 4014 *
4010 4015 * RETURN: 0 on success, error code on failure.
4011 4016 *
4012 4017 * Timestamps:
4013 4018 * tdvp - ctime|mtime updated
4014 4019 * svp - ctime updated
4015 4020 */
4016 4021 /* ARGSUSED */
4017 4022 static int
4018 4023 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4019 4024 caller_context_t *ct, int flags)
4020 4025 {
4021 4026 znode_t *dzp = VTOZ(tdvp);
4022 4027 znode_t *tzp, *szp;
4023 4028 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
4024 4029 zilog_t *zilog;
4025 4030 zfs_dirlock_t *dl;
4026 4031 dmu_tx_t *tx;
4027 4032 vnode_t *realvp;
4028 4033 int error;
4029 4034 int zf = ZNEW;
4030 4035 uint64_t parent;
4031 4036 uid_t owner;
4032 4037 boolean_t waited = B_FALSE;
4033 4038
4034 4039 ASSERT(tdvp->v_type == VDIR);
4035 4040
4036 4041 ZFS_ENTER(zfsvfs);
4037 4042 ZFS_VERIFY_ZP(dzp);
4038 4043 zilog = zfsvfs->z_log;
4039 4044
4040 4045 if (VOP_REALVP(svp, &realvp, ct) == 0)
4041 4046 svp = realvp;
4042 4047
4043 4048 /*
4044 4049 * POSIX dictates that we return EPERM here.
4045 4050 * Better choices include ENOTSUP or EISDIR.
4046 4051 */
4047 4052 if (svp->v_type == VDIR) {
4048 4053 ZFS_EXIT(zfsvfs);
4049 4054 return (SET_ERROR(EPERM));
4050 4055 }
4051 4056
4052 4057 szp = VTOZ(svp);
4053 4058 ZFS_VERIFY_ZP(szp);
4054 4059
4055 4060 /*
4056 4061 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
4057 4062 * ctldir appear to have the same v_vfsp.
4058 4063 */
4059 4064 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
4060 4065 ZFS_EXIT(zfsvfs);
4061 4066 return (SET_ERROR(EXDEV));
4062 4067 }
4063 4068
4064 4069 /* Prevent links to .zfs/shares files */
4065 4070
4066 4071 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4067 4072 &parent, sizeof (uint64_t))) != 0) {
4068 4073 ZFS_EXIT(zfsvfs);
4069 4074 return (error);
4070 4075 }
4071 4076 if (parent == zfsvfs->z_shares_dir) {
4072 4077 ZFS_EXIT(zfsvfs);
4073 4078 return (SET_ERROR(EPERM));
4074 4079 }
4075 4080
4076 4081 if (zfsvfs->z_utf8 && u8_validate(name,
4077 4082 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4078 4083 ZFS_EXIT(zfsvfs);
4079 4084 return (SET_ERROR(EILSEQ));
4080 4085 }
4081 4086 if (flags & FIGNORECASE)
4082 4087 zf |= ZCILOOK;
4083 4088
4084 4089 /*
4085 4090 * We do not support links between attributes and non-attributes
4086 4091 * because of the potential security risk of creating links
4087 4092 * into "normal" file space in order to circumvent restrictions
4088 4093 * imposed in attribute space.
4089 4094 */
4090 4095 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4091 4096 ZFS_EXIT(zfsvfs);
4092 4097 return (SET_ERROR(EINVAL));
4093 4098 }
4094 4099
4095 4100
4096 4101 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4097 4102 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
4098 4103 ZFS_EXIT(zfsvfs);
4099 4104 return (SET_ERROR(EPERM));
4100 4105 }
4101 4106
4102 4107 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4103 4108 ZFS_EXIT(zfsvfs);
4104 4109 return (error);
4105 4110 }
4106 4111
4107 4112 top:
4108 4113 /*
4109 4114 * Attempt to lock directory; fail if entry already exists.
4110 4115 */
4111 4116 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4112 4117 if (error) {
4113 4118 ZFS_EXIT(zfsvfs);
4114 4119 return (error);
4115 4120 }
4116 4121
4117 4122 tx = dmu_tx_create(zfsvfs->z_os);
4118 4123 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4119 4124 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4120 4125 zfs_sa_upgrade_txholds(tx, szp);
4121 4126 zfs_sa_upgrade_txholds(tx, dzp);
4122 4127 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
4123 4128 if (error) {
4124 4129 zfs_dirent_unlock(dl);
4125 4130 if (error == ERESTART) {
4126 4131 waited = B_TRUE;
4127 4132 dmu_tx_wait(tx);
4128 4133 dmu_tx_abort(tx);
4129 4134 goto top;
4130 4135 }
4131 4136 dmu_tx_abort(tx);
4132 4137 ZFS_EXIT(zfsvfs);
4133 4138 return (error);
4134 4139 }
4135 4140
4136 4141 error = zfs_link_create(dl, szp, tx, 0);
4137 4142
4138 4143 if (error == 0) {
4139 4144 uint64_t txtype = TX_LINK;
4140 4145 if (flags & FIGNORECASE)
4141 4146 txtype |= TX_CI;
4142 4147 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4143 4148 }
4144 4149
4145 4150 dmu_tx_commit(tx);
4146 4151
4147 4152 zfs_dirent_unlock(dl);
4148 4153
4149 4154 if (error == 0) {
4150 4155 vnevent_link(svp, ct);
4151 4156 }
4152 4157
4153 4158 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4154 4159 zil_commit(zilog, 0);
4155 4160
4156 4161 ZFS_EXIT(zfsvfs);
4157 4162 return (error);
4158 4163 }
4159 4164
4160 4165 /*
4161 4166 * zfs_null_putapage() is used when the file system has been force
4162 4167 * unmounted. It just drops the pages.
4163 4168 */
4164 4169 /* ARGSUSED */
4165 4170 static int
4166 4171 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4167 4172 size_t *lenp, int flags, cred_t *cr)
4168 4173 {
4169 4174 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4170 4175 return (0);
4171 4176 }
4172 4177
4173 4178 /*
4174 4179 * Push a page out to disk, klustering if possible.
4175 4180 *
4176 4181 * IN: vp - file to push page to.
4177 4182 * pp - page to push.
4178 4183 * flags - additional flags.
4179 4184 * cr - credentials of caller.
4180 4185 *
4181 4186 * OUT: offp - start of range pushed.
4182 4187 * lenp - len of range pushed.
4183 4188 *
4184 4189 * RETURN: 0 on success, error code on failure.
4185 4190 *
4186 4191 * NOTE: callers must have locked the page to be pushed. On
4187 4192 * exit, the page (and all other pages in the kluster) must be
4188 4193 * unlocked.
4189 4194 */
4190 4195 /* ARGSUSED */
4191 4196 static int
4192 4197 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4193 4198 size_t *lenp, int flags, cred_t *cr)
4194 4199 {
4195 4200 znode_t *zp = VTOZ(vp);
4196 4201 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4197 4202 dmu_tx_t *tx;
4198 4203 u_offset_t off, koff;
4199 4204 size_t len, klen;
4200 4205 int err;
4201 4206
4202 4207 off = pp->p_offset;
4203 4208 len = PAGESIZE;
4204 4209 /*
4205 4210 * If our blocksize is bigger than the page size, try to kluster
4206 4211 * multiple pages so that we write a full block (thus avoiding
4207 4212 * a read-modify-write).
4208 4213 */
4209 4214 if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4210 4215 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4211 4216 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4212 4217 ASSERT(koff <= zp->z_size);
4213 4218 if (koff + klen > zp->z_size)
4214 4219 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4215 4220 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4216 4221 }
4217 4222 ASSERT3U(btop(len), ==, btopr(len));
4218 4223
4219 4224 /*
4220 4225 * Can't push pages past end-of-file.
4221 4226 */
4222 4227 if (off >= zp->z_size) {
4223 4228 /* ignore all pages */
4224 4229 err = 0;
4225 4230 goto out;
4226 4231 } else if (off + len > zp->z_size) {
4227 4232 int npages = btopr(zp->z_size - off);
4228 4233 page_t *trunc;
4229 4234
4230 4235 page_list_break(&pp, &trunc, npages);
4231 4236 /* ignore pages past end of file */
4232 4237 if (trunc)
4233 4238 pvn_write_done(trunc, flags);
4234 4239 len = zp->z_size - off;
4235 4240 }
4236 4241
4237 4242 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4238 4243 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4239 4244 err = SET_ERROR(EDQUOT);
4240 4245 goto out;
4241 4246 }
4242 4247 tx = dmu_tx_create(zfsvfs->z_os);
4243 4248 dmu_tx_hold_write(tx, zp->z_id, off, len);
4244 4249
4245 4250 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4246 4251 zfs_sa_upgrade_txholds(tx, zp);
4247 4252 err = dmu_tx_assign(tx, TXG_WAIT);
4248 4253 if (err != 0) {
4249 4254 dmu_tx_abort(tx);
4250 4255 goto out;
4251 4256 }
4252 4257
4253 4258 if (zp->z_blksz <= PAGESIZE) {
4254 4259 caddr_t va = zfs_map_page(pp, S_READ);
4255 4260 ASSERT3U(len, <=, PAGESIZE);
4256 4261 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4257 4262 zfs_unmap_page(pp, va);
4258 4263 } else {
4259 4264 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4260 4265 }
4261 4266
4262 4267 if (err == 0) {
4263 4268 uint64_t mtime[2], ctime[2];
4264 4269 sa_bulk_attr_t bulk[3];
4265 4270 int count = 0;
4266 4271
4267 4272 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4268 4273 &mtime, 16);
4269 4274 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4270 4275 &ctime, 16);
4271 4276 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4272 4277 &zp->z_pflags, 8);
4273 4278 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4274 4279 B_TRUE);
4275 4280 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4276 4281 ASSERT0(err);
4277 4282 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4278 4283 }
4279 4284 dmu_tx_commit(tx);
4280 4285
4281 4286 out:
4282 4287 pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4283 4288 if (offp)
4284 4289 *offp = off;
4285 4290 if (lenp)
4286 4291 *lenp = len;
4287 4292
4288 4293 return (err);
4289 4294 }
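The kluster window above is power-of-two arithmetic throughout: klen rounds the block size up to whole pages, koff aligns the faulting offset down to a window boundary, and the final P2ROUNDUP clips the window at end of file. A stand-alone check of those steps with hypothetical numbers (a 128K-block file, 150000 bytes long, with a dirty page at offset 139264), using local copies of the sysmacros.h helpers so it builds on its own:

        #include <stdio.h>
        #include <stdint.h>

        #define PAGESIZE        4096UL
        #define ISP2(x)         (((x) & ((x) - 1)) == 0)
        #define P2ALIGN(x, a)   ((x) & -(a))
        #define P2ROUNDUP(x, a) (-(-(x) & -(a)))

        int
        main(void)
        {
                uint64_t blksz = 131072, size = 150000, off = 139264;

                uint64_t klen = P2ROUNDUP(blksz, PAGESIZE);             /* 131072 */
                uint64_t koff = ISP2(klen) ? P2ALIGN(off, klen) : 0;    /* 131072 */
                if (koff + klen > size)
                        klen = P2ROUNDUP(size - koff, PAGESIZE);        /* 20480 */

                printf("kluster [%llu, %llu)\n",
                    (unsigned long long)koff,
                    (unsigned long long)(koff + klen));
                return (0);
        }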
4290 4295
4291 4296 /*
4292 4297 * Copy the portion of the file indicated from pages into the file.
4293 4298  * The pages are stored in a page list attached to the file's vnode.
4294 4299 *
4295 4300 * IN: vp - vnode of file to push page data to.
4296 4301 * off - position in file to put data.
4297 4302 * len - amount of data to write.
4298 4303 * flags - flags to control the operation.
4299 4304 * cr - credentials of caller.
4300 4305 * ct - caller context.
4301 4306 *
4302 4307 * RETURN: 0 on success, error code on failure.
4303 4308 *
4304 4309 * Timestamps:
4305 4310 * vp - ctime|mtime updated
4306 4311 */
4307 4312 /*ARGSUSED*/
4308 4313 static int
4309 4314 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4310 4315 caller_context_t *ct)
4311 4316 {
4312 4317 znode_t *zp = VTOZ(vp);
4313 4318 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4314 4319 page_t *pp;
4315 4320 size_t io_len;
4316 4321 u_offset_t io_off;
4317 4322 uint_t blksz;
4318 4323 locked_range_t *lr;
4319 4324 int error = 0;
4320 4325
4321 4326 ZFS_ENTER(zfsvfs);
4322 4327 ZFS_VERIFY_ZP(zp);
4323 4328
4324 4329 /*
4325 4330 * There's nothing to do if no data is cached.
4326 4331 */
4327 4332 if (!vn_has_cached_data(vp)) {
4328 4333 ZFS_EXIT(zfsvfs);
4329 4334 return (0);
4330 4335 }
4331 4336
4332 4337 /*
4333 4338 * Align this request to the file block size in case we kluster.
4334 4339 	 * XXX - this can result in pretty aggressive locking, which can
4335 4340 	 * impact simultaneous read/write access.  One option might be
4336 4341 * to break up long requests (len == 0) into block-by-block
4337 4342 * operations to get narrower locking.
4338 4343 */
4339 4344 blksz = zp->z_blksz;
4340 4345 if (ISP2(blksz))
4341 4346 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4342 4347 else
4343 4348 io_off = 0;
4344 4349 if (len > 0 && ISP2(blksz))
4345 4350 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4346 4351 else
4347 4352 io_len = 0;
4348 4353
4349 4354 if (io_len == 0) {
4350 4355 /*
4351 4356 * Search the entire vp list for pages >= io_off.
4352 4357 */
4353 4358 lr = rangelock_enter(&zp->z_rangelock,
4354 4359 io_off, UINT64_MAX, RL_WRITER);
4355 4360 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4356 4361 goto out;
4357 4362 }
4358 4363 lr = rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_WRITER);
4359 4364
4360 4365 if (off > zp->z_size) {
4361 4366 /* past end of file */
4362 4367 rangelock_exit(lr);
4363 4368 ZFS_EXIT(zfsvfs);
4364 4369 return (0);
4365 4370 }
4366 4371
4367 4372 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4368 4373
4369 4374 for (off = io_off; io_off < off + len; io_off += io_len) {
4370 4375 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4371 4376 pp = page_lookup(vp, io_off,
4372 4377 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4373 4378 } else {
4374 4379 pp = page_lookup_nowait(vp, io_off,
4375 4380 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4376 4381 }
4377 4382
4378 4383 if (pp != NULL && pvn_getdirty(pp, flags)) {
4379 4384 int err;
4380 4385
4381 4386 /*
4382 4387 * Found a dirty page to push
4383 4388 */
4384 4389 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4385 4390 if (err)
4386 4391 error = err;
4387 4392 } else {
4388 4393 io_len = PAGESIZE;
4389 4394 }
4390 4395 }
4391 4396 out:
4392 4397 rangelock_exit(lr);
4393 4398 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4394 4399 zil_commit(zfsvfs->z_log, zp->z_id);
4395 4400 ZFS_EXIT(zfsvfs);
4396 4401 return (error);
4397 4402 }
4398 4403
4399 4404 /*ARGSUSED*/
4400 4405 void
4401 4406 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4402 4407 {
4403 4408 znode_t *zp = VTOZ(vp);
4404 4409 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4405 4410 int error;
4406 4411
4407 4412 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4408 4413 if (zp->z_sa_hdl == NULL) {
4409 4414 /*
4410 4415 * The fs has been unmounted, or we did a
4411 4416 * suspend/resume and this file no longer exists.
4412 4417 */
4413 4418 if (vn_has_cached_data(vp)) {
4414 4419 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4415 4420 B_INVAL, cr);
4416 4421 }
4417 4422
4418 4423 mutex_enter(&zp->z_lock);
4419 4424 mutex_enter(&vp->v_lock);
4420 4425 ASSERT(vp->v_count == 1);
4421 4426 VN_RELE_LOCKED(vp);
4422 4427 mutex_exit(&vp->v_lock);
4423 4428 mutex_exit(&zp->z_lock);
4424 4429 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4425 4430 zfs_znode_free(zp);
4426 4431 return;
4427 4432 }
4428 4433
4429 4434 /*
4430 4435 * Attempt to push any data in the page cache. If this fails
4431 4436 * we will get kicked out later in zfs_zinactive().
4432 4437 */
4433 4438 if (vn_has_cached_data(vp)) {
4434 4439 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4435 4440 cr);
4436 4441 }
4437 4442
4438 4443 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4439 4444 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4440 4445
4441 4446 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4442 4447 zfs_sa_upgrade_txholds(tx, zp);
4443 4448 error = dmu_tx_assign(tx, TXG_WAIT);
4444 4449 if (error) {
4445 4450 dmu_tx_abort(tx);
4446 4451 } else {
4447 4452 mutex_enter(&zp->z_lock);
4448 4453 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4449 4454 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4450 4455 zp->z_atime_dirty = 0;
4451 4456 mutex_exit(&zp->z_lock);
4452 4457 dmu_tx_commit(tx);
4453 4458 }
4454 4459 }
4455 4460
4456 4461 zfs_zinactive(zp);
4457 4462 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4458 4463 }
4459 4464
4460 4465 /*
4461 4466 * Bounds-check the seek operation.
4462 4467 *
4463 4468 * IN: vp - vnode seeking within
4464 4469 * ooff - old file offset
4465 4470 * noffp - pointer to new file offset
4466 4471 * ct - caller context
4467 4472 *
4468 4473 * RETURN: 0 on success, EINVAL if new offset invalid.
4469 4474 */
4470 4475 /* ARGSUSED */
4471 4476 static int
4472 4477 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4473 4478 caller_context_t *ct)
4474 4479 {
4475 4480 if (vp->v_type == VDIR)
4476 4481 return (0);
4477 4482 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4478 4483 }
4479 4484
4480 4485 /*
4481 4486 * Pre-filter the generic locking function to trap attempts to place
4482 4487 * a mandatory lock on a memory mapped file.
4483 4488 */
4484 4489 static int
4485 4490 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4486 4491 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4487 4492 {
4488 4493 znode_t *zp = VTOZ(vp);
4489 4494 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4490 4495
4491 4496 ZFS_ENTER(zfsvfs);
4492 4497 ZFS_VERIFY_ZP(zp);
4493 4498
4494 4499 /*
4495 4500 * We are following the UFS semantics with respect to mapcnt
4496 4501 * here: If we see that the file is mapped already, then we will
4497 4502 * return an error, but we don't worry about races between this
4498 4503 * function and zfs_map().
4499 4504 */
4500 4505 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4501 4506 ZFS_EXIT(zfsvfs);
4502 4507 return (SET_ERROR(EAGAIN));
4503 4508 }
4504 4509 ZFS_EXIT(zfsvfs);
4505 4510 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4506 4511 }
4507 4512
4508 4513 /*
4509 4514 * If we can't find a page in the cache, we will create a new page
4510 4515 * and fill it with file data. For efficiency, we may try to fill
4511 4516 * multiple pages at once (klustering) to fill up the supplied page
4512 4517 * list. Note that the pages to be filled are held with an exclusive
4513 4518 * lock to prevent access by other threads while they are being filled.
4514 4519 */
4515 4520 static int
4516 4521 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4517 4522 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4518 4523 {
4519 4524 znode_t *zp = VTOZ(vp);
4520 4525 page_t *pp, *cur_pp;
4521 4526 objset_t *os = zp->z_zfsvfs->z_os;
4522 4527 u_offset_t io_off, total;
4523 4528 size_t io_len;
4524 4529 int err;
4525 4530
4526 4531 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4527 4532 /*
4528 4533 * We only have a single page, don't bother klustering
4529 4534 */
4530 4535 io_off = off;
4531 4536 io_len = PAGESIZE;
4532 4537 pp = page_create_va(vp, io_off, io_len,
4533 4538 PG_EXCL | PG_WAIT, seg, addr);
4534 4539 } else {
4535 4540 /*
4536 4541 * Try to find enough pages to fill the page list
4537 4542 */
4538 4543 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4539 4544 &io_len, off, plsz, 0);
4540 4545 }
4541 4546 if (pp == NULL) {
4542 4547 /*
4543 4548 * The page already exists, nothing to do here.
4544 4549 */
4545 4550 *pl = NULL;
4546 4551 return (0);
4547 4552 }
4548 4553
4549 4554 /*
4550 4555 * Fill the pages in the kluster.
4551 4556 */
4552 4557 cur_pp = pp;
4553 4558 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4554 4559 caddr_t va;
4555 4560
4556 4561 ASSERT3U(io_off, ==, cur_pp->p_offset);
4557 4562 va = zfs_map_page(cur_pp, S_WRITE);
4558 4563 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4559 4564 DMU_READ_PREFETCH);
4560 4565 zfs_unmap_page(cur_pp, va);
4561 4566 if (err) {
4562 4567 /* On error, toss the entire kluster */
4563 4568 pvn_read_done(pp, B_ERROR);
4564 4569 /* convert checksum errors into IO errors */
4565 4570 if (err == ECKSUM)
4566 4571 err = SET_ERROR(EIO);
4567 4572 return (err);
4568 4573 }
4569 4574 cur_pp = cur_pp->p_next;
4570 4575 }
4571 4576
4572 4577 /*
4573 4578 * Fill in the page list array from the kluster starting
4574 4579 * from the desired offset `off'.
4575 4580 * NOTE: the page list will always be null terminated.
4576 4581 */
4577 4582 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4578 4583 ASSERT(pl == NULL || (*pl)->p_offset == off);
4579 4584
4580 4585 return (0);
4581 4586 }
4582 4587
4583 4588 /*
4584 4589 * Return pointers to the pages for the file region [off, off + len]
4585 4590 * in the pl array. If plsz is greater than len, this function may
4586 4591 * also return page pointers from after the specified region
4587 4592 * (i.e. the region [off, off + plsz]). These additional pages are
4588 4593 * only returned if they are already in the cache, or were created as
4589 4594 * part of a klustered read.
4590 4595 *
4591 4596 * IN: vp - vnode of file to get data from.
4592 4597 * off - position in file to get data from.
4593 4598 * len - amount of data to retrieve.
4594 4599 * plsz - length of provided page list.
4595 4600 * seg - segment to obtain pages for.
4596 4601 * addr - virtual address of fault.
4597 4602 * rw - mode of created pages.
4598 4603 * cr - credentials of caller.
4599 4604 * ct - caller context.
4600 4605 *
4601 4606 * OUT: protp - protection mode of created pages.
4602 4607 * pl - list of pages created.
4603 4608 *
4604 4609 * RETURN: 0 on success, error code on failure.
4605 4610 *
4606 4611 * Timestamps:
4607 4612 * vp - atime updated
4608 4613 */
4609 4614 /* ARGSUSED */
4610 4615 static int
4611 4616 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4612 4617 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4613 4618 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4614 4619 {
4615 4620 znode_t *zp = VTOZ(vp);
4616 4621 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4617 4622 page_t **pl0 = pl;
4618 4623 int err = 0;
4619 4624
4620 4625 	/* we do our own caching; faultahead is unnecessary */
4621 4626 if (pl == NULL)
4622 4627 return (0);
4623 4628 else if (len > plsz)
4624 4629 len = plsz;
4625 4630 else
4626 4631 len = P2ROUNDUP(len, PAGESIZE);
4627 4632 ASSERT(plsz >= len);
4628 4633
4629 4634 ZFS_ENTER(zfsvfs);
4630 4635 ZFS_VERIFY_ZP(zp);
4631 4636
4632 4637 if (protp)
4633 4638 *protp = PROT_ALL;
4634 4639
4635 4640 /*
4636 4641 * Loop through the requested range [off, off + len) looking
4637 4642 * for pages. If we don't find a page, we will need to create
4638 4643 * a new page and fill it with data from the file.
4639 4644 */
4640 4645 while (len > 0) {
4641 4646 if (*pl = page_lookup(vp, off, SE_SHARED))
4642 4647 *(pl+1) = NULL;
4643 4648 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4644 4649 goto out;
4645 4650 while (*pl) {
4646 4651 ASSERT3U((*pl)->p_offset, ==, off);
4647 4652 off += PAGESIZE;
4648 4653 addr += PAGESIZE;
4649 4654 if (len > 0) {
4650 4655 ASSERT3U(len, >=, PAGESIZE);
4651 4656 len -= PAGESIZE;
4652 4657 }
4653 4658 ASSERT3U(plsz, >=, PAGESIZE);
4654 4659 plsz -= PAGESIZE;
4655 4660 pl++;
4656 4661 }
4657 4662 }
4658 4663
4659 4664 /*
4660 4665 * Fill out the page array with any pages already in the cache.
4661 4666 */
4662 4667 while (plsz > 0 &&
4663 4668 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4664 4669 off += PAGESIZE;
4665 4670 plsz -= PAGESIZE;
4666 4671 }
4667 4672 out:
4668 4673 if (err) {
4669 4674 /*
4670 4675 * Release any pages we have previously locked.
4671 4676 */
4672 4677 while (pl > pl0)
4673 4678 page_unlock(*--pl);
4674 4679 } else {
4675 4680 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4676 4681 }
4677 4682
4678 4683 *pl = NULL;
4679 4684
4680 4685 ZFS_EXIT(zfsvfs);
4681 4686 return (err);
4682 4687 }
4683 4688
4684 4689 /*
4685 4690 * Request a memory map for a section of a file. This code interacts
4686 4691 * with common code and the VM system as follows:
4687 4692 *
4688 4693 * - common code calls mmap(), which ends up in smmap_common()
4689 4694 * - this calls VOP_MAP(), which takes you into (say) zfs
4690 4695 * - zfs_map() calls as_map(), passing segvn_create() as the callback
4691 4696 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
4692 4697 * - zfs_addmap() updates z_mapcnt
4693 4698 */
4694 4699 /*ARGSUSED*/
4695 4700 static int
4696 4701 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4697 4702 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4698 4703 caller_context_t *ct)
4699 4704 {
4700 4705 znode_t *zp = VTOZ(vp);
4701 4706 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4702 4707 segvn_crargs_t vn_a;
4703 4708 int error;
4704 4709
4705 4710 ZFS_ENTER(zfsvfs);
4706 4711 ZFS_VERIFY_ZP(zp);
4707 4712
4708 4713 /*
4709 4714 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
4710 4715 */
4711 4716
4712 4717 if ((prot & PROT_WRITE) && (zp->z_pflags &
4713 4718 (ZFS_IMMUTABLE | ZFS_APPENDONLY))) {
4714 4719 ZFS_EXIT(zfsvfs);
4715 4720 return (SET_ERROR(EPERM));
4716 4721 }
4717 4722
4718 4723 if ((prot & (PROT_READ | PROT_EXEC)) &&
4719 4724 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4720 4725 ZFS_EXIT(zfsvfs);
4721 4726 return (SET_ERROR(EACCES));
4722 4727 }
4723 4728
4724 4729 if (vp->v_flag & VNOMAP) {
4725 4730 ZFS_EXIT(zfsvfs);
4726 4731 return (SET_ERROR(ENOSYS));
4727 4732 }
4728 4733
4729 4734 if (off < 0 || len > MAXOFFSET_T - off) {
4730 4735 ZFS_EXIT(zfsvfs);
4731 4736 return (SET_ERROR(ENXIO));
4732 4737 }
4733 4738
4734 4739 if (vp->v_type != VREG) {
4735 4740 ZFS_EXIT(zfsvfs);
4736 4741 return (SET_ERROR(ENODEV));
4737 4742 }
4738 4743
4739 4744 /*
4740 4745 * If file is locked, disallow mapping.
4741 4746 */
4742 4747 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4743 4748 ZFS_EXIT(zfsvfs);
4744 4749 return (SET_ERROR(EAGAIN));
4745 4750 }
4746 4751
4747 4752 as_rangelock(as);
4748 4753 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4749 4754 if (error != 0) {
4750 4755 as_rangeunlock(as);
4751 4756 ZFS_EXIT(zfsvfs);
4752 4757 return (error);
4753 4758 }
4754 4759
4755 4760 vn_a.vp = vp;
4756 4761 vn_a.offset = (u_offset_t)off;
4757 4762 vn_a.type = flags & MAP_TYPE;
4758 4763 vn_a.prot = prot;
4759 4764 vn_a.maxprot = maxprot;
4760 4765 vn_a.cred = cr;
4761 4766 vn_a.amp = NULL;
4762 4767 vn_a.flags = flags & ~MAP_TYPE;
4763 4768 vn_a.szc = 0;
4764 4769 vn_a.lgrp_mem_policy_flags = 0;
4765 4770
4766 4771 error = as_map(as, *addrp, len, segvn_create, &vn_a);
4767 4772
4768 4773 as_rangeunlock(as);
4769 4774 ZFS_EXIT(zfsvfs);
4770 4775 return (error);
4771 4776 }
4772 4777
4773 4778 /* ARGSUSED */
4774 4779 static int
4775 4780 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4776 4781 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4777 4782 caller_context_t *ct)
4778 4783 {
4779 4784 uint64_t pages = btopr(len);
4780 4785
4781 4786 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4782 4787 return (0);
4783 4788 }
4784 4789
4785 4790 /*
4786 4791 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4787 4792 * more accurate mtime for the associated file. Since we don't have a way of
4788 4793 * detecting when the data was actually modified, we have to resort to
4789 4794 * heuristics. If an explicit msync() is done, then we mark the mtime when the
4790 4795 * last page is pushed. The problem occurs when the msync() call is omitted,
4791 4796  * which is by far the most common case:
4792 4797 *
4793 4798 * open()
4794 4799 * mmap()
4795 4800 * <modify memory>
4796 4801 * munmap()
4797 4802 * close()
4798 4803 * <time lapse>
4799 4804 * putpage() via fsflush
4800 4805 *
4801 4806 * If we wait until fsflush to come along, we can have a modification time that
4802 4807 * is some arbitrary point in the future. In order to prevent this in the
4803 4808 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4804 4809 * torn down.
4805 4810 */
4806 4811 /* ARGSUSED */
4807 4812 static int
4808 4813 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4809 4814 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4810 4815 caller_context_t *ct)
4811 4816 {
4812 4817 uint64_t pages = btopr(len);
4813 4818
4814 4819 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4815 4820 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4816 4821
4817 4822 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4818 4823 vn_has_cached_data(vp))
4819 4824 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4820 4825
4821 4826 return (0);
4822 4827 }
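The effect of this heuristic is visible from user space: write through a MAP_SHARED mapping, unmap it without ever calling msync(), and the dirty pages are still pushed (and the file's mtime updated) at munmap() time rather than whenever fsflush next runs. A minimal demonstration (file name is arbitrary):

        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <sys/stat.h>
        #include <unistd.h>

        int
        main(void)
        {
                const char *path = "mapped.dat";        /* hypothetical test file */
                int fd = open(path, O_RDWR | O_CREAT, 0644);
                if (fd == -1 || ftruncate(fd, 4096) == -1) {
                        perror(path);
                        return (1);
                }

                char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                    MAP_SHARED, fd, 0);
                if (p == MAP_FAILED) {
                        perror("mmap");
                        return (1);
                }

                (void) memcpy(p, "dirty", 5);   /* modify memory, no msync() */
                (void) munmap(p, 4096);         /* flush happens in zfs_delmap() */

                struct stat st;
                (void) fstat(fd, &st);
                printf("mtime: %ld\n", (long)st.st_mtime);
                (void) close(fd);
                return (0);
        }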
4823 4828
4824 4829 /*
4825 4830 * Free or allocate space in a file. Currently, this function only
4826 4831 * supports the `F_FREESP' command. However, this command is somewhat
4827 4832 * misnamed, as its functionality includes the ability to allocate as
4828 4833 * well as free space.
4829 4834 *
4830 4835 * IN: vp - vnode of file to free data in.
4831 4836 * cmd - action to take (only F_FREESP supported).
4832 4837 * bfp - section of file to free/alloc.
4833 4838 * flag - current file open mode flags.
4834 4839 * offset - current file offset.
4835 4840 * cr - credentials of caller [UNUSED].
4836 4841 * ct - caller context.
4837 4842 *
4838 4843 * RETURN: 0 on success, error code on failure.
4839 4844 *
4840 4845 * Timestamps:
4841 4846 * vp - ctime|mtime updated
4842 4847 */
4843 4848 /* ARGSUSED */
4844 4849 static int
4845 4850 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4846 4851 offset_t offset, cred_t *cr, caller_context_t *ct)
4847 4852 {
4848 4853 znode_t *zp = VTOZ(vp);
4849 4854 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4850 4855 uint64_t off, len;
4851 4856 int error;
4852 4857
4853 4858 ZFS_ENTER(zfsvfs);
4854 4859 ZFS_VERIFY_ZP(zp);
4855 4860
4856 4861 if (cmd != F_FREESP) {
4857 4862 ZFS_EXIT(zfsvfs);
4858 4863 return (SET_ERROR(EINVAL));
4859 4864 }
4860 4865
4861 4866 /*
4862 4867 	 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots),
4863 4868 	 * our callers might not be able to detect properly that we are read-only,
4864 4869 * so check it explicitly here.
4865 4870 */
4866 4871 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
4867 4872 ZFS_EXIT(zfsvfs);
4868 4873 return (SET_ERROR(EROFS));
4869 4874 }
4870 4875
4871 4876 if (error = convoff(vp, bfp, 0, offset)) {
4872 4877 ZFS_EXIT(zfsvfs);
4873 4878 return (error);
4874 4879 }
4875 4880
4876 4881 if (bfp->l_len < 0) {
4877 4882 ZFS_EXIT(zfsvfs);
4878 4883 return (SET_ERROR(EINVAL));
4879 4884 }
4880 4885
4881 4886 off = bfp->l_start;
4882 4887 len = bfp->l_len; /* 0 means from off to end of file */
4883 4888
4884 4889 error = zfs_freesp(zp, off, len, flag, TRUE);
4885 4890
4886 4891 if (error == 0 && off == 0 && len == 0)
4887 4892 vnevent_truncate(ZTOV(zp), ct);
4888 4893
4889 4894 ZFS_EXIT(zfsvfs);
4890 4895 return (error);
4891 4896 }
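From user space this function is normally reached through fcntl(F_FREESP) (on illumos, ftruncate() has historically been implemented in terms of F_FREESP as well). A minimal usage sketch that frees everything from 1 MB to end of file; per the comment above, l_len == 0 means "from l_start to EOF", so this truncates the file at that offset:

        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(int argc, char **argv)
        {
                if (argc != 2) {
                        (void) fprintf(stderr, "usage: %s file\n", argv[0]);
                        return (1);
                }

                int fd = open(argv[1], O_WRONLY);
                if (fd == -1) {
                        perror("open");
                        return (1);
                }

                struct flock fl = { 0 };
                fl.l_whence = SEEK_SET;
                fl.l_start = 1024 * 1024;
                fl.l_len = 0;           /* 0 == free to end of file */

                if (fcntl(fd, F_FREESP, &fl) == -1)
                        perror("fcntl(F_FREESP)");

                (void) close(fd);
                return (0);
        }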
4892 4897
4893 4898 /*ARGSUSED*/
4894 4899 static int
4895 4900 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4896 4901 {
4897 4902 znode_t *zp = VTOZ(vp);
4898 4903 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4899 4904 uint32_t gen;
4900 4905 uint64_t gen64;
4901 4906 uint64_t object = zp->z_id;
4902 4907 zfid_short_t *zfid;
4903 4908 int size, i, error;
4904 4909
4905 4910 ZFS_ENTER(zfsvfs);
4906 4911 ZFS_VERIFY_ZP(zp);
4907 4912
4908 4913 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4909 4914 &gen64, sizeof (uint64_t))) != 0) {
4910 4915 ZFS_EXIT(zfsvfs);
4911 4916 return (error);
4912 4917 }
4913 4918
4914 4919 gen = (uint32_t)gen64;
4915 4920
4916 4921 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4917 4922 if (fidp->fid_len < size) {
4918 4923 fidp->fid_len = size;
4919 4924 ZFS_EXIT(zfsvfs);
4920 4925 return (SET_ERROR(ENOSPC));
4921 4926 }
4922 4927
4923 4928 zfid = (zfid_short_t *)fidp;
4924 4929
4925 4930 zfid->zf_len = size;
4926 4931
4927 4932 for (i = 0; i < sizeof (zfid->zf_object); i++)
4928 4933 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4929 4934
4930 4935 /* Must have a non-zero generation number to distinguish from .zfs */
4931 4936 if (gen == 0)
4932 4937 gen = 1;
4933 4938 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4934 4939 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4935 4940
4936 4941 if (size == LONG_FID_LEN) {
4937 4942 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4938 4943 zfid_long_t *zlfid;
4939 4944
4940 4945 zlfid = (zfid_long_t *)fidp;
4941 4946
4942 4947 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4943 4948 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4944 4949
4945 4950 /* XXX - this should be the generation number for the objset */
4946 4951 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4947 4952 zlfid->zf_setgen[i] = 0;
4948 4953 }
4949 4954
4950 4955 ZFS_EXIT(zfsvfs);
4951 4956 return (0);
4952 4957 }
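The byte-spreading loops above store the object number and generation least-significant byte first, so the resulting file id is independent of host endianness (which matters for NFS file handles that may outlive a server). A stand-alone round trip of that encoding, assuming the 6-byte object / 4-byte generation layout of zfid_short_t:

        #include <stdio.h>
        #include <stdint.h>

        int
        main(void)
        {
                uint64_t object = 0x1234abcdULL;        /* hypothetical znode id */
                uint32_t gen = 7;
                uint8_t zf_object[6], zf_gen[4];
                int i;

                for (i = 0; i < (int)sizeof (zf_object); i++)
                        zf_object[i] = (uint8_t)(object >> (8 * i));
                for (i = 0; i < (int)sizeof (zf_gen); i++)
                        zf_gen[i] = (uint8_t)(gen >> (8 * i));

                /* Decode and verify the round trip. */
                uint64_t obj2 = 0;
                for (i = 0; i < (int)sizeof (zf_object); i++)
                        obj2 |= (uint64_t)zf_object[i] << (8 * i);

                printf("object %#llx -> %#llx\n",
                    (unsigned long long)object, (unsigned long long)obj2);
                return (0);
        }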
4953 4958
4954 4959 static int
4955 4960 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4956 4961 caller_context_t *ct)
4957 4962 {
4958 4963 znode_t *zp, *xzp;
4959 4964 zfsvfs_t *zfsvfs;
4960 4965 zfs_dirlock_t *dl;
4961 4966 int error;
4962 4967
4963 4968 switch (cmd) {
4964 4969 case _PC_LINK_MAX:
4965 4970 *valp = ULONG_MAX;
4966 4971 return (0);
4967 4972
4968 4973 case _PC_FILESIZEBITS:
4969 4974 *valp = 64;
4970 4975 return (0);
4971 4976
4972 4977 case _PC_XATTR_EXISTS:
4973 4978 zp = VTOZ(vp);
4974 4979 zfsvfs = zp->z_zfsvfs;
4975 4980 ZFS_ENTER(zfsvfs);
4976 4981 ZFS_VERIFY_ZP(zp);
4977 4982 *valp = 0;
4978 4983 error = zfs_dirent_lock(&dl, zp, "", &xzp,
4979 4984 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4980 4985 if (error == 0) {
4981 4986 zfs_dirent_unlock(dl);
4982 4987 if (!zfs_dirempty(xzp))
4983 4988 *valp = 1;
4984 4989 VN_RELE(ZTOV(xzp));
4985 4990 } else if (error == ENOENT) {
4986 4991 /*
4987 4992 * If there aren't extended attributes, it's the
4988 4993 * same as having zero of them.
4989 4994 */
4990 4995 error = 0;
4991 4996 }
4992 4997 ZFS_EXIT(zfsvfs);
4993 4998 return (error);
4994 4999
4995 5000 case _PC_SATTR_ENABLED:
4996 5001 case _PC_SATTR_EXISTS:
4997 5002 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4998 5003 (vp->v_type == VREG || vp->v_type == VDIR);
4999 5004 return (0);
5000 5005
5001 5006 case _PC_ACCESS_FILTERING:
5002 5007 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5003 5008 vp->v_type == VDIR;
5004 5009 return (0);
5005 5010
5006 5011 case _PC_ACL_ENABLED:
5007 5012 *valp = _ACL_ACE_ENABLED;
5008 5013 return (0);
5009 5014
5010 5015 case _PC_MIN_HOLE_SIZE:
5011 5016 *valp = (ulong_t)SPA_MINBLOCKSIZE;
5012 5017 return (0);
5013 5018
5014 5019 case _PC_TIMESTAMP_RESOLUTION:
5015 5020 /* nanosecond timestamp resolution */
5016 5021 *valp = 1L;
5017 5022 return (0);
5018 5023
5019 5024 default:
5020 5025 return (fs_pathconf(vp, cmd, valp, cr, ct));
5021 5026 }
5022 5027 }
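All of these values surface through pathconf(2)/fpathconf(2), which arrive here via VOP_PATHCONF. A quick user-space probe (the printed values assume a ZFS-backed path):

        #include <stdio.h>
        #include <unistd.h>

        int
        main(int argc, char **argv)
        {
                const char *path = (argc > 1) ? argv[1] : ".";

                printf("_PC_FILESIZEBITS = %ld\n",
                    pathconf(path, _PC_FILESIZEBITS));  /* 64 on ZFS */
                printf("_PC_LINK_MAX     = %ld\n",
                    pathconf(path, _PC_LINK_MAX));
                return (0);
        }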
5023 5028
5024 5029 /*ARGSUSED*/
5025 5030 static int
5026 5031 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5027 5032 caller_context_t *ct)
5028 5033 {
5029 5034 znode_t *zp = VTOZ(vp);
5030 5035 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5031 5036 int error;
5032 5037 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5033 5038
5034 5039 ZFS_ENTER(zfsvfs);
5035 5040 ZFS_VERIFY_ZP(zp);
5036 5041 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5037 5042 ZFS_EXIT(zfsvfs);
5038 5043
5039 5044 return (error);
5040 5045 }
5041 5046
5042 5047 /*ARGSUSED*/
5043 5048 static int
5044 5049 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5045 5050 caller_context_t *ct)
5046 5051 {
5047 5052 znode_t *zp = VTOZ(vp);
5048 5053 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5049 5054 int error;
5050 5055 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5051 5056 zilog_t *zilog = zfsvfs->z_log;
5052 5057
5053 5058 ZFS_ENTER(zfsvfs);
5054 5059 ZFS_VERIFY_ZP(zp);
5055 5060
5056 5061 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5057 5062
5058 5063 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5059 5064 zil_commit(zilog, 0);
5060 5065
5061 5066 ZFS_EXIT(zfsvfs);
5062 5067 return (error);
5063 5068 }
5064 5069
5065 5070 /*
5066 5071 * The smallest read we may consider to loan out an arcbuf.
5067 5072 * This must be a power of 2.
5068 5073 */
5069 5074 int zcr_blksz_min = (1 << 10); /* 1K */
5070 5075 /*
5071 5076 * If set to less than the file block size, allow loaning out of an
5072 5077 * arcbuf for a partial block read. This must be a power of 2.
5073 5078 */
5074 5079 int zcr_blksz_max = (1 << 17); /* 128K */
5075 5080
5076 5081 /*ARGSUSED*/
5077 5082 static int
5078 5083 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5079 5084 caller_context_t *ct)
5080 5085 {
5081 5086 znode_t *zp = VTOZ(vp);
5082 5087 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5083 5088 int max_blksz = zfsvfs->z_max_blksz;
5084 5089 uio_t *uio = &xuio->xu_uio;
5085 5090 ssize_t size = uio->uio_resid;
5086 5091 offset_t offset = uio->uio_loffset;
5087 5092 int blksz;
5088 5093 int fullblk, i;
5089 5094 arc_buf_t *abuf;
5090 5095 ssize_t maxsize;
5091 5096 int preamble, postamble;
5092 5097
5093 5098 if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5094 5099 return (SET_ERROR(EINVAL));
5095 5100
5096 5101 ZFS_ENTER(zfsvfs);
5097 5102 ZFS_VERIFY_ZP(zp);
5098 5103 switch (ioflag) {
5099 5104 case UIO_WRITE:
5100 5105 /*
5101 5106 * Loan out an arc_buf for write if write size is bigger than
5102 5107 * max_blksz, and the file's block size is also max_blksz.
5103 5108 */
5104 5109 blksz = max_blksz;
5105 5110 if (size < blksz || zp->z_blksz != blksz) {
5106 5111 ZFS_EXIT(zfsvfs);
5107 5112 return (SET_ERROR(EINVAL));
5108 5113 }
5109 5114 /*
5110 5115 * Caller requests buffers for write before knowing where the
5111 5116 * write offset might be (e.g. NFS TCP write).
5112 5117 */
5113 5118 if (offset == -1) {
5114 5119 preamble = 0;
5115 5120 } else {
5116 5121 preamble = P2PHASE(offset, blksz);
5117 5122 if (preamble) {
5118 5123 preamble = blksz - preamble;
5119 5124 size -= preamble;
5120 5125 }
5121 5126 }
5122 5127
5123 5128 postamble = P2PHASE(size, blksz);
5124 5129 size -= postamble;
5125 5130
5126 5131 fullblk = size / blksz;
5127 5132 (void) dmu_xuio_init(xuio,
5128 5133 (preamble != 0) + fullblk + (postamble != 0));
5129 5134 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5130 5135 int, postamble, int,
5131 5136 (preamble != 0) + fullblk + (postamble != 0));
5132 5137
5133 5138 /*
5134 5139 * Have to fix iov base/len for partial buffers. They
5135 5140 * currently represent full arc_buf's.
5136 5141 */
5137 5142 if (preamble) {
5138 5143 /* data begins in the middle of the arc_buf */
5139 5144 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5140 5145 blksz);
5141 5146 ASSERT(abuf);
5142 5147 (void) dmu_xuio_add(xuio, abuf,
5143 5148 blksz - preamble, preamble);
5144 5149 }
5145 5150
5146 5151 for (i = 0; i < fullblk; i++) {
5147 5152 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5148 5153 blksz);
5149 5154 ASSERT(abuf);
5150 5155 (void) dmu_xuio_add(xuio, abuf, 0, blksz);
5151 5156 }
5152 5157
5153 5158 if (postamble) {
5154 5159 /* data ends in the middle of the arc_buf */
5155 5160 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5156 5161 blksz);
5157 5162 ASSERT(abuf);
5158 5163 (void) dmu_xuio_add(xuio, abuf, 0, postamble);
5159 5164 }
5160 5165 break;
5161 5166 case UIO_READ:
5162 5167 /*
5163 5168 * Loan out an arc_buf for read if the read size is larger than
5164 5169 * the current file block size. Block alignment is not
5165 5170 * considered. Partial arc_buf will be loaned out for read.
5166 5171 */
5167 5172 blksz = zp->z_blksz;
5168 5173 if (blksz < zcr_blksz_min)
5169 5174 blksz = zcr_blksz_min;
5170 5175 if (blksz > zcr_blksz_max)
5171 5176 blksz = zcr_blksz_max;
5172 5177 /* avoid potential complexity of dealing with it */
5173 5178 if (blksz > max_blksz) {
5174 5179 ZFS_EXIT(zfsvfs);
5175 5180 return (SET_ERROR(EINVAL));
5176 5181 }
5177 5182
5178 5183 maxsize = zp->z_size - uio->uio_loffset;
5179 5184 if (size > maxsize)
5180 5185 size = maxsize;
5181 5186
5182 5187 if (size < blksz || vn_has_cached_data(vp)) {
5183 5188 ZFS_EXIT(zfsvfs);
5184 5189 return (SET_ERROR(EINVAL));
5185 5190 }
5186 5191 break;
5187 5192 default:
5188 5193 ZFS_EXIT(zfsvfs);
5189 5194 return (SET_ERROR(EINVAL));
5190 5195 }
5191 5196
5192 5197 uio->uio_extflg = UIO_XUIO;
5193 5198 XUIO_XUZC_RW(xuio) = ioflag;
5194 5199 ZFS_EXIT(zfsvfs);
5195 5200 return (0);
5196 5201 }
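The preamble/postamble split above peels the unaligned head and tail off the request so that only whole max_blksz buffers sit in the middle. Worked through with hypothetical numbers (a 600000-byte write at offset 100000 with 128K blocks) it yields one 31072-byte preamble, four full 131072-byte buffers, and a 44640-byte postamble, i.e. six loaned arc_bufs. A stand-alone check, with P2PHASE copied locally from sysmacros.h:

        #include <stdio.h>
        #include <stdint.h>

        #define P2PHASE(x, a)   ((x) & ((a) - 1))

        int
        main(void)
        {
                int64_t blksz = 131072, size = 600000, offset = 100000;

                int64_t preamble = P2PHASE(offset, blksz);      /* 100000 */
                if (preamble) {
                        preamble = blksz - preamble;            /* 31072 */
                        size -= preamble;                       /* 568928 */
                }
                int64_t postamble = P2PHASE(size, blksz);       /* 44640 */
                size -= postamble;                              /* 524288 */
                int64_t fullblk = size / blksz;                 /* 4 */

                printf("bufs = %lld\n", (long long)
                    ((preamble != 0) + fullblk + (postamble != 0)));    /* 6 */
                return (0);
        }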
5197 5202
5198 5203 /*ARGSUSED*/
5199 5204 static int
5200 5205 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5201 5206 {
5202 5207 int i;
5203 5208 arc_buf_t *abuf;
5204 5209 int ioflag = XUIO_XUZC_RW(xuio);
5205 5210
5206 5211 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5207 5212
5208 5213 i = dmu_xuio_cnt(xuio);
5209 5214 while (i-- > 0) {
5210 5215 abuf = dmu_xuio_arcbuf(xuio, i);
5211 5216 /*
5212 5217 * if abuf == NULL, it must be a write buffer
5213 5218 * that has been returned in zfs_write().
5214 5219 */
5215 5220 if (abuf)
5216 5221 dmu_return_arcbuf(abuf);
5217 5222 ASSERT(abuf || ioflag == UIO_WRITE);
5218 5223 }
5219 5224
5220 5225 dmu_xuio_fini(xuio);
5221 5226 return (0);
5222 5227 }
5223 5228
5224 5229 /*
5225 5230 * Predeclare these here so that the compiler assumes that
5226 5231 * this is an "old style" function declaration that does
5227 5232 * not include arguments => we won't get type mismatch errors
5228 5233 * in the initializations that follow.
5229 5234 */
5230 5235 static int zfs_inval();
5231 5236 static int zfs_isdir();
5232 5237
5233 5238 static int
5234 5239 zfs_inval()
5235 5240 {
5236 5241 return (SET_ERROR(EINVAL));
5237 5242 }
5238 5243
5239 5244 static int
5240 5245 zfs_isdir()
5241 5246 {
5242 5247 return (SET_ERROR(EISDIR));
5243 5248 }
5244 5249 /*
5245 5250 * Directory vnode operations template
5246 5251 */
5247 5252 vnodeops_t *zfs_dvnodeops;
5248 5253 const fs_operation_def_t zfs_dvnodeops_template[] = {
5249 5254 VOPNAME_OPEN, { .vop_open = zfs_open },
5250 5255 VOPNAME_CLOSE, { .vop_close = zfs_close },
5251 5256 VOPNAME_READ, { .error = zfs_isdir },
5252 5257 VOPNAME_WRITE, { .error = zfs_isdir },
5253 5258 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5254 5259 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5255 5260 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5256 5261 VOPNAME_ACCESS, { .vop_access = zfs_access },
5257 5262 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5258 5263 VOPNAME_CREATE, { .vop_create = zfs_create },
5259 5264 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5260 5265 VOPNAME_LINK, { .vop_link = zfs_link },
5261 5266 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5262 5267 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
5263 5268 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5264 5269 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5265 5270 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
5266 5271 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5267 5272 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5268 5273 VOPNAME_FID, { .vop_fid = zfs_fid },
5269 5274 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5270 5275 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5271 5276 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5272 5277 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5273 5278 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5274 5279 NULL, NULL
5275 5280 };
5276 5281
5277 5282 /*
5278 5283 * Regular file vnode operations template
5279 5284 */
5280 5285 vnodeops_t *zfs_fvnodeops;
5281 5286 const fs_operation_def_t zfs_fvnodeops_template[] = {
5282 5287 VOPNAME_OPEN, { .vop_open = zfs_open },
5283 5288 VOPNAME_CLOSE, { .vop_close = zfs_close },
5284 5289 VOPNAME_READ, { .vop_read = zfs_read },
5285 5290 VOPNAME_WRITE, { .vop_write = zfs_write },
5286 5291 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5287 5292 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5288 5293 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5289 5294 VOPNAME_ACCESS, { .vop_access = zfs_access },
5290 5295 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5291 5296 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5292 5297 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5293 5298 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5294 5299 VOPNAME_FID, { .vop_fid = zfs_fid },
5295 5300 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5296 5301 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
5297 5302 VOPNAME_SPACE, { .vop_space = zfs_space },
5298 5303 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
5299 5304 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
5300 5305 VOPNAME_MAP, { .vop_map = zfs_map },
5301 5306 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
5302 5307 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
5303 5308 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5304 5309 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5305 5310 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5306 5311 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5307 5312 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
5308 5313 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
5309 5314 NULL, NULL
5310 5315 };
5311 5316
5312 5317 /*
5313 5318 * Symbolic link vnode operations template
5314 5319 */
5315 5320 vnodeops_t *zfs_symvnodeops;
5316 5321 const fs_operation_def_t zfs_symvnodeops_template[] = {
5317 5322 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5318 5323 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5319 5324 VOPNAME_ACCESS, { .vop_access = zfs_access },
5320 5325 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5321 5326 VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
5322 5327 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5323 5328 VOPNAME_FID, { .vop_fid = zfs_fid },
5324 5329 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5325 5330 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5326 5331 NULL, NULL
5327 5332 };
5328 5333
5329 5334 /*
5330 5335 * Special share hidden file vnode operations template
5331 5336 */
5332 5337 vnodeops_t *zfs_sharevnodeops;
5333 5338 const fs_operation_def_t zfs_sharevnodeops_template[] = {
5334 5339 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5335 5340 VOPNAME_ACCESS, { .vop_access = zfs_access },
5336 5341 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5337 5342 VOPNAME_FID, { .vop_fid = zfs_fid },
5338 5343 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5339 5344 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5340 5345 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5341 5346 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5342 5347 NULL, NULL
5343 5348 };
5344 5349
5345 5350 /*
5346 5351 * Extended attribute directory vnode operations template
5347 5352 *
5348 5353 * This template is identical to the directory vnode
5349 5354 * operations template except for the restricted operations:
5350 5355 * VOP_MKDIR()
5351 5356 * VOP_SYMLINK()
5352 5357 *
5353 5358 * Note that there are other restrictions embedded in:
5354 5359 * zfs_create() - restrict type to VREG
5355 5360 * zfs_link() - no links into/out of attribute space
5356 5361 * zfs_rename() - no moves into/out of attribute space
5357 5362 */
5358 5363 vnodeops_t *zfs_xdvnodeops;
5359 5364 const fs_operation_def_t zfs_xdvnodeops_template[] = {
5360 5365 VOPNAME_OPEN, { .vop_open = zfs_open },
5361 5366 VOPNAME_CLOSE, { .vop_close = zfs_close },
5362 5367 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5363 5368 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5364 5369 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5365 5370 VOPNAME_ACCESS, { .vop_access = zfs_access },
5366 5371 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5367 5372 VOPNAME_CREATE, { .vop_create = zfs_create },
5368 5373 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5369 5374 VOPNAME_LINK, { .vop_link = zfs_link },
5370 5375 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5371 5376 VOPNAME_MKDIR, { .error = zfs_inval },
5372 5377 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5373 5378 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5374 5379 VOPNAME_SYMLINK, { .error = zfs_inval },
5375 5380 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5376 5381 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5377 5382 VOPNAME_FID, { .vop_fid = zfs_fid },
5378 5383 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5379 5384 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5380 5385 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5381 5386 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5382 5387 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5383 5388 NULL, NULL
5384 5389 };
5385 5390
5386 5391 /*
5387 5392 * Error vnode operations template
5388 5393 */
5389 5394 vnodeops_t *zfs_evnodeops;
5390 5395 const fs_operation_def_t zfs_evnodeops_template[] = {
5391 5396 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5392 5397 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5393 5398 NULL, NULL
5394 5399 };
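
For reference, the templates above are turned into live vnodeops_t tables at module load time. The following is a minimal sketch of that registration (in the gate this is done by zfs_create_op_tables() in zfs_vfsops.c; zfs_make_op_tables_sketch is a hypothetical name and only two of the five tables are shown):

	/*
	 * Schematic registration of the op tables above (sketch;
	 * the remaining templates are handled the same way).
	 */
	static int
	zfs_make_op_tables_sketch(void)
	{
		int error;

		/* vn_make_ops() builds a vnodeops_t from a template. */
		error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
		    &zfs_dvnodeops);
		if (error)
			return (error);

		error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
		    &zfs_fvnodeops);
		if (error)
			return (error);

		/* ... likewise for symlink, share, and xattr-dir tables ... */
		return (0);
	}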
(4394 lines elided)