Print this page
    
OS-5148 ftruncate at offset should emit proper events
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-5291 lxbrand inotify02 LTP regression
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
OS-4319 zfs mishandles partial writes
OS-3294 add support for inotify
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
    
      
        | Split | 
	Close | 
      
      | Expand all | 
      | Collapse all | 
    
    
          --- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
          +++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  
    | 
      ↓ open down ↓ | 
    15 lines elided | 
    
      ↑ open up ↑ | 
  
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  24   24   * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  25   25   * Copyright (c) 2014 Integros [integros.com]
  26      - * Copyright 2015 Joyent, Inc.
       26 + * Copyright 2016 Joyent, Inc.
  27   27   */
  28   28  
  29   29  /* Portions Copyright 2007 Jeremy Teo */
  30   30  /* Portions Copyright 2010 Robert Milkowski */
  31   31  
  32   32  #include <sys/types.h>
  33   33  #include <sys/param.h>
  34   34  #include <sys/time.h>
  35   35  #include <sys/systm.h>
  36   36  #include <sys/sysmacros.h>
  37   37  #include <sys/resource.h>
  38   38  #include <sys/vfs.h>
  39   39  #include <sys/vfs_opreg.h>
  40   40  #include <sys/vnode.h>
  41   41  #include <sys/file.h>
  42   42  #include <sys/stat.h>
  43   43  #include <sys/kmem.h>
  44   44  #include <sys/taskq.h>
  45   45  #include <sys/uio.h>
  46   46  #include <sys/vmsystm.h>
  47   47  #include <sys/atomic.h>
  48   48  #include <sys/vm.h>
  49   49  #include <vm/seg_vn.h>
  50   50  #include <vm/pvn.h>
  51   51  #include <vm/as.h>
  52   52  #include <vm/kpm.h>
  53   53  #include <vm/seg_kpm.h>
  54   54  #include <sys/mman.h>
  55   55  #include <sys/pathname.h>
  56   56  #include <sys/cmn_err.h>
  57   57  #include <sys/errno.h>
  58   58  #include <sys/unistd.h>
  59   59  #include <sys/zfs_dir.h>
  60   60  #include <sys/zfs_acl.h>
  61   61  #include <sys/zfs_ioctl.h>
  62   62  #include <sys/fs/zfs.h>
  63   63  #include <sys/dmu.h>
  64   64  #include <sys/dmu_objset.h>
  65   65  #include <sys/spa.h>
  66   66  #include <sys/txg.h>
  67   67  #include <sys/dbuf.h>
  68   68  #include <sys/zap.h>
  69   69  #include <sys/sa.h>
  70   70  #include <sys/dirent.h>
  71   71  #include <sys/policy.h>
  72   72  #include <sys/sunddi.h>
  73   73  #include <sys/filio.h>
  74   74  #include <sys/sid.h>
  75   75  #include "fs/fs_subr.h"
  76   76  #include <sys/zfs_ctldir.h>
  77   77  #include <sys/zfs_fuid.h>
  78   78  #include <sys/zfs_sa.h>
  79   79  #include <sys/dnlc.h>
  80   80  #include <sys/zfs_rlock.h>
  81   81  #include <sys/extdirent.h>
  82   82  #include <sys/kidmap.h>
  83   83  #include <sys/cred.h>
  84   84  #include <sys/attr.h>
  85   85  
  86   86  /*
  87   87   * Programming rules.
  88   88   *
  89   89   * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  90   90   * properly lock its in-core state, create a DMU transaction, do the work,
  91   91   * record this work in the intent log (ZIL), commit the DMU transaction,
  92   92   * and wait for the intent log to commit if it is a synchronous operation.
  93   93   * Moreover, the vnode ops must work in both normal and log replay context.
  94   94   * The ordering of events is important to avoid deadlocks and references
  95   95   * to freed memory.  The example below illustrates the following Big Rules:
  96   96   *
  97   97   *  (1) A check must be made in each zfs thread for a mounted file system.
  98   98   *      This is done avoiding races using ZFS_ENTER(zfsvfs).
  99   99   *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 100  100   *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 101  101   *      can return EIO from the calling function.
 102  102   *
 103  103   *  (2) VN_RELE() should always be the last thing except for zil_commit()
 104  104   *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
 105  105   *      First, if it's the last reference, the vnode/znode
 106  106   *      can be freed, so the zp may point to freed memory.  Second, the last
 107  107   *      reference will call zfs_zinactive(), which may induce a lot of work --
 108  108   *      pushing cached pages (which acquires range locks) and syncing out
 109  109   *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 110  110   *      which could deadlock the system if you were already holding one.
 111  111   *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 112  112   *
 113  113   *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 114  114   *      as they can span dmu_tx_assign() calls.
 115  115   *
 116  116   *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 117  117   *      dmu_tx_assign().  This is critical because we don't want to block
 118  118   *      while holding locks.
 119  119   *
 120  120   *      If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 121  121   *      reduces lock contention and CPU usage when we must wait (note that if
 122  122   *      throughput is constrained by the storage, nearly every transaction
 123  123   *      must wait).
 124  124   *
 125  125   *      Note, in particular, that if a lock is sometimes acquired before
 126  126   *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 127  127   *      to use a non-blocking assign can deadlock the system.  The scenario:
 128  128   *
 129  129   *      Thread A has grabbed a lock before calling dmu_tx_assign().
 130  130   *      Thread B is in an already-assigned tx, and blocks for this lock.
 131  131   *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 132  132   *      forever, because the previous txg can't quiesce until B's tx commits.
 133  133   *
 134  134   *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 135  135   *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 136  136   *      calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 137  137   *      to indicate that this operation has already called dmu_tx_wait().
 138  138   *      This will ensure that we don't retry forever, waiting a short bit
 139  139   *      each time.
 140  140   *
 141  141   *  (5) If the operation succeeded, generate the intent log entry for it
 142  142   *      before dropping locks.  This ensures that the ordering of events
 143  143   *      in the intent log matches the order in which they actually occurred.
 144  144   *      During ZIL replay the zfs_log_* functions will update the sequence
 145  145   *      number to indicate the zil transaction has replayed.
 146  146   *
 147  147   *  (6) At the end of each vnode op, the DMU tx must always commit,
 148  148   *      regardless of whether there were any errors.
 149  149   *
 150  150   *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 151  151   *      to ensure that synchronous semantics are provided when necessary.
 152  152   *
 153  153   * In general, this is how things should be ordered in each vnode op:
 154  154   *
 155  155   *      ZFS_ENTER(zfsvfs);              // exit if unmounted
 156  156   * top:
 157  157   *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may VN_HOLD())
 158  158   *      rw_enter(...);                  // grab any other locks you need
 159  159   *      tx = dmu_tx_create(...);        // get DMU tx
 160  160   *      dmu_tx_hold_*();                // hold each object you might modify
 161  161   *      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 162  162   *      if (error) {
 163  163   *              rw_exit(...);           // drop locks
 164  164   *              zfs_dirent_unlock(dl);  // unlock directory entry
 165  165   *              VN_RELE(...);           // release held vnodes
 166  166   *              if (error == ERESTART) {
 167  167   *                      waited = B_TRUE;
 168  168   *                      dmu_tx_wait(tx);
 169  169   *                      dmu_tx_abort(tx);
 170  170   *                      goto top;
 171  171   *              }
 172  172   *              dmu_tx_abort(tx);       // abort DMU tx
 173  173   *              ZFS_EXIT(zfsvfs);       // finished in zfs
 174  174   *              return (error);         // really out of space
 175  175   *      }
 176  176   *      error = do_real_work();         // do whatever this VOP does
 177  177   *      if (error == 0)
 178  178   *              zfs_log_*(...);         // on success, make ZIL entry
 179  179   *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 180  180   *      rw_exit(...);                   // drop locks
 181  181   *      zfs_dirent_unlock(dl);          // unlock directory entry
 182  182   *      VN_RELE(...);                   // release held vnodes
 183  183   *      zil_commit(zilog, foid);        // synchronous when necessary
 184  184   *      ZFS_EXIT(zfsvfs);               // finished in zfs
 185  185   *      return (error);                 // done, report error
 186  186   */
 187  187  
/*
 * VOP_OPEN entry point: enforce append-only semantics, optionally virus
 * scan the file, and account for synchronous opens.
 */
/* ARGSUSED */
static int
zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
{
	znode_t	*zp = VTOZ(*vpp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * An append-only file may only be opened for writing if the
	 * caller also requested append mode.
	 */
	if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
	    ((flag & FAPPEND) == 0)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/*
	 * Virus scan non-empty regular files (excluding anything under
	 * the .zfs control directory) when scanning is enabled; a
	 * non-zero fs_vscan() result denies the open.
	 */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
		if (fs_vscan(*vpp, cr, 0) != 0) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EACCES));
		}
	}

	/* Keep a count of the synchronous opens in the znode */
	if (flag & (FSYNC | FDSYNC))
		atomic_inc_32(&zp->z_sync_cnt);

	ZFS_EXIT(zfsvfs);
	return (0);
}
 220  220  
/*
 * VOP_CLOSE entry point: release file/share locks held by this process,
 * drop the synchronous-open count, and optionally virus scan the file.
 */
/* ARGSUSED */
static int
zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
    caller_context_t *ct)
{
	znode_t	*zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;

	/*
	 * Clean up any locks held by this process on the vp.
	 */
	cleanlocks(vp, ddi_get_pid(), 0);
	cleanshares(vp, ddi_get_pid());

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Decrement the synchronous opens in the znode */
	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
		atomic_dec_32(&zp->z_sync_cnt);

	/*
	 * As on open, virus scan eligible regular files when scanning
	 * is enabled (the scan is expected to succeed here).
	 */
	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
	    ZTOV(zp)->v_type == VREG &&
	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
		VERIFY(fs_vscan(vp, cr, 1) == 0);

	ZFS_EXIT(zfsvfs);
	return (0);
}
 250  250  
/*
 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter: on entry
 * it is the offset to search from; on success it is updated to the
 * start of the next hole or data region.
 *
 * Returns 0 on success, ENXIO if the offset is at/past EOF or nothing
 * was found, or another error from the DMU.
 */
static int
zfs_holey(vnode_t *vp, int cmd, offset_t *off)
{
	znode_t	*zp = VTOZ(vp);
	uint64_t noff = (uint64_t)*off; /* new offset */
	uint64_t file_sz;
	int error;
	boolean_t hole;

	/* Searching at or beyond EOF has nothing to find. */
	file_sz = zp->z_size;
	if (noff >= file_sz)  {
		return (SET_ERROR(ENXIO));
	}

	if (cmd == _FIO_SEEK_HOLE)
		hole = B_TRUE;
	else
		hole = B_FALSE;

	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);

	/* ESRCH from the DMU means "not found"; map it to ENXIO. */
	if (error == ESRCH)
		return (SET_ERROR(ENXIO));

	/*
	 * We could find a hole that begins after the logical end-of-file,
	 * because dmu_offset_next() only works on whole blocks.  If the
	 * EOF falls mid-block, then indicate that the "virtual hole"
	 * at the end of the file begins at the logical EOF, rather than
	 * at the end of the last block.
	 */
	if (noff > file_sz) {
		ASSERT(hole);
		noff = file_sz;
	}

	/* Never move the caller's offset backwards. */
	if (noff < *off)
		return (error);
	*off = noff;
	return (error);
}
 296  296  
/*
 * VOP_IOCTL entry point.  Handles file-system sync (_FIOFFS), the two
 * bfu compatibility ioctls, hole/data seeking, and the filled-block
 * count query; anything else gets ENOTTY.
 */
/* ARGSUSED */
static int
zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
    int *rvalp, caller_context_t *ct)
{
	offset_t off;
	offset_t ndata;
	dmu_object_info_t doi;
	int error;
	zfsvfs_t *zfsvfs;
	znode_t *zp;

	switch (com) {
	case _FIOFFS:
	{
		/* Flush all dirty data for this file system. */
		return (zfs_sync(vp->v_vfsp, 0, cred));

		/*
		 * The following two ioctls are used by bfu.  Faking out,
		 * necessary to avoid bfu errors.
		 */
	}
	case _FIOGDIO:
	case _FIOSDIO:
	{
		return (0);
	}

	case _FIO_SEEK_DATA:
	case _FIO_SEEK_HOLE:
	{
		/* Copy in the caller's starting offset. */
		if (ddi_copyin((void *)data, &off, sizeof (off), flag))
			return (SET_ERROR(EFAULT));

		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/* offset parameter is in/out */
		error = zfs_holey(vp, com, &off);
		ZFS_EXIT(zfsvfs);
		if (error)
			return (error);
		/* Return the hole/data offset that was found. */
		if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	case _FIO_COUNT_FILLED:
	{
		/*
		 * _FIO_COUNT_FILLED adds a new ioctl command which
		 * exposes the number of filled blocks in a
		 * ZFS object.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);

		/*
		 * Wait for all dirty blocks for this object
		 * to get synced out to disk, and the DMU info
		 * updated.
		 */
		error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Retrieve fill count from DMU object.
		 */
		error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
		if (error) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		ndata = doi.doi_fill_count;

		ZFS_EXIT(zfsvfs);
		if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
			return (SET_ERROR(EFAULT));
		return (0);
	}
	}
	/* Unrecognized ioctl command. */
	return (SET_ERROR(ENOTTY));
}
 387  387  
 388  388  /*
 389  389   * Utility functions to map and unmap a single physical page.  These
 390  390   * are used to manage the mappable copies of ZFS file data, and therefore
 391  391   * do not update ref/mod bits.
 392  392   */
 393  393  caddr_t
 394  394  zfs_map_page(page_t *pp, enum seg_rw rw)
 395  395  {
 396  396          if (kpm_enable)
 397  397                  return (hat_kpm_mapin(pp, 0));
 398  398          ASSERT(rw == S_READ || rw == S_WRITE);
 399  399          return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
 400  400              (caddr_t)-1));
 401  401  }
 402  402  
 403  403  void
 404  404  zfs_unmap_page(page_t *pp, caddr_t addr)
 405  405  {
 406  406          if (kpm_enable) {
 407  407                  hat_kpm_mapout(pp, 0, addr);
 408  408          } else {
 409  409                  ppmapout(addr);
 410  410          }
 411  411  }
 412  412  
 413  413  /*
 414  414   * When a file is memory mapped, we must keep the IO data synchronized
 415  415   * between the DMU cache and the memory mapped pages.  What this means:
 416  416   *
 417  417   * On Write:    If we find a memory mapped page, we write to *both*
 418  418   *              the page and the dmu buffer.
 419  419   */
 420  420  static void
 421  421  update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
 422  422  {
 423  423          int64_t off;
 424  424  
 425  425          off = start & PAGEOFFSET;
 426  426          for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 427  427                  page_t *pp;
 428  428                  uint64_t nbytes = MIN(PAGESIZE - off, len);
 429  429  
 430  430                  if (pp = page_lookup(vp, start, SE_SHARED)) {
 431  431                          caddr_t va;
 432  432  
 433  433                          va = zfs_map_page(pp, S_WRITE);
 434  434                          (void) dmu_read(os, oid, start+off, nbytes, va+off,
 435  435                              DMU_READ_PREFETCH);
 436  436                          zfs_unmap_page(pp, va);
 437  437                          page_unlock(pp);
 438  438                  }
 439  439                  len -= nbytes;
 440  440                  off = 0;
 441  441          }
 442  442  }
 443  443  
 444  444  /*
 445  445   * When a file is memory mapped, we must keep the IO data synchronized
 446  446   * between the DMU cache and the memory mapped pages.  What this means:
 447  447   *
 448  448   * On Read:     We "read" preferentially from memory mapped pages,
 449  449   *              else we default from the dmu buffer.
 450  450   *
 451  451   * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 452  452   *       the file is memory mapped.
 453  453   */
 454  454  static int
 455  455  mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 456  456  {
 457  457          znode_t *zp = VTOZ(vp);
 458  458          int64_t start, off;
 459  459          int len = nbytes;
 460  460          int error = 0;
 461  461  
 462  462          start = uio->uio_loffset;
 463  463          off = start & PAGEOFFSET;
 464  464          for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 465  465                  page_t *pp;
 466  466                  uint64_t bytes = MIN(PAGESIZE - off, len);
 467  467  
 468  468                  if (pp = page_lookup(vp, start, SE_SHARED)) {
 469  469                          caddr_t va;
 470  470  
 471  471                          va = zfs_map_page(pp, S_READ);
 472  472                          error = uiomove(va + off, bytes, UIO_READ, uio);
 473  473                          zfs_unmap_page(pp, va);
 474  474                          page_unlock(pp);
 475  475                  } else {
 476  476                          error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 477  477                              uio, bytes);
 478  478                  }
 479  479                  len -= bytes;
 480  480                  off = 0;
 481  481                  if (error)
 482  482                          break;
 483  483          }
 484  484          return (error);
 485  485  }
 486  486  
/*
 * Upper bound on the number of bytes handed to the DMU per iteration
 * of the zfs_read() copy loop.
 */
offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
 488  488  
/*
 * Read bytes from specified file into supplied buffer.
 *
 *      IN:     vp      - vnode of file to be read from.
 *              uio     - structure supplying read location, range info,
 *                        and return buffer.
 *              ioflag  - SYNC flags; used to provide FRSYNC semantics.
 *              cr      - credentials of caller.
 *              ct      - caller context
 *
 *      OUT:    uio     - updated offset and range, buffer filled.
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * Side Effects:
 *      vp - atime updated if byte count > 0
 */
/* ARGSUSED */
static int
zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	ssize_t		n, nbytes;
	int		error = 0;
	rl_t		*rl;
	xuio_t		*xuio = NULL;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Reads of anti-virus quarantined files are denied. */
	if (zp->z_pflags & ZFS_AV_QUARANTINED) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	/*
	 * Validate file offset
	 */
	if (uio->uio_loffset < (offset_t)0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Fasttrack empty reads
	 */
	if (uio->uio_resid == 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Check for mandatory locks
	 */
	if (MANDMODE(zp->z_mode)) {
		if (error = chklock(vp, FREAD,
		    uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * If we're in FRSYNC mode, sync out this znode before reading it.
	 */
	if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);

	/*
	 * Lock the range against changes.
	 */
	rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);

	/*
	 * If we are reading past end-of-file we can skip
	 * to the end; but we might still need to set atime.
	 */
	if (uio->uio_loffset >= zp->z_size) {
		error = 0;
		goto out;
	}

	ASSERT(uio->uio_loffset < zp->z_size);
	/* Never read past the current end-of-file. */
	n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);

	/*
	 * Zero-copy (xuio) request: size the xuio for the number of
	 * blocks the read spans and, if the file also has cached pages,
	 * pre-allocate a full ARC buffer per block.
	 */
	if ((uio->uio_extflg == UIO_XUIO) &&
	    (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
		int nblk;
		int blksz = zp->z_blksz;
		uint64_t offset = uio->uio_loffset;

		xuio = (xuio_t *)uio;
		if ((ISP2(blksz))) {
			nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
			    blksz)) / blksz;
		} else {
			/* The whole read must fit in the single block. */
			ASSERT(offset + n <= blksz);
			nblk = 1;
		}
		(void) dmu_xuio_init(xuio, nblk);

		if (vn_has_cached_data(vp)) {
			/*
			 * For simplicity, we always allocate a full buffer
			 * even if we only expect to read a portion of a block.
			 */
			while (--nblk >= 0) {
				(void) dmu_xuio_add(xuio,
				    dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
				    blksz), 0, blksz);
			}
		}
	}

	/*
	 * Copy out the data in chunks of at most zfs_read_chunk_size,
	 * going through the page cache whenever the vnode has cached
	 * (mmapped) pages.
	 */
	while (n > 0) {
		nbytes = MIN(n, zfs_read_chunk_size -
		    P2PHASE(uio->uio_loffset, zfs_read_chunk_size));

		if (vn_has_cached_data(vp)) {
			error = mappedread(vp, nbytes, uio);
		} else {
			error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
			    uio, nbytes);
		}
		if (error) {
			/* convert checksum errors into IO errors */
			if (error == ECKSUM)
				error = SET_ERROR(EIO);
			break;
		}

		n -= nbytes;
	}
out:
	zfs_range_unlock(rl);

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	ZFS_EXIT(zfsvfs);
	return (error);
}
 630  630  
 631  631  /*
 632  632   * Write the bytes to a file.
 633  633   *
 634  634   *      IN:     vp      - vnode of file to be written to.
 635  635   *              uio     - structure supplying write location, range info,
 636  636   *                        and data buffer.
 637  637   *              ioflag  - FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 638  638   *                        set if in append mode.
 639  639   *              cr      - credentials of caller.
 640  640   *              ct      - caller context (NFS/CIFS fem monitor only)
 641  641   *
 642  642   *      OUT:    uio     - updated offset and range.
 643  643   *
 644  644   *      RETURN: 0 on success, error code on failure.
 645  645   *
 646  646   * Timestamps:
 647  647   *      vp - ctime|mtime updated if byte count > 0
 648  648   */
 649  649  
 650  650  /* ARGSUSED */
 651  651  static int
 652  652  zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 653  653  {
 654  654          znode_t         *zp = VTOZ(vp);
 655  655          rlim64_t        limit = uio->uio_llimit;
 656  656          ssize_t         start_resid = uio->uio_resid;
  
    | 
      ↓ open down ↓ | 
    620 lines elided | 
    
      ↑ open up ↑ | 
  
 657  657          ssize_t         tx_bytes;
 658  658          uint64_t        end_size;
 659  659          dmu_tx_t        *tx;
 660  660          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 661  661          zilog_t         *zilog;
 662  662          offset_t        woff;
 663  663          ssize_t         n, nbytes;
 664  664          rl_t            *rl;
 665  665          int             max_blksz = zfsvfs->z_max_blksz;
 666  666          int             error = 0;
      667 +        int             prev_error;
 667  668          arc_buf_t       *abuf;
 668  669          iovec_t         *aiov = NULL;
 669  670          xuio_t          *xuio = NULL;
 670  671          int             i_iov = 0;
 671  672          int             iovcnt = uio->uio_iovcnt;
 672  673          iovec_t         *iovp = uio->uio_iov;
 673  674          int             write_eof;
 674  675          int             count = 0;
 675  676          sa_bulk_attr_t  bulk[4];
 676  677          uint64_t        mtime[2], ctime[2];
 677  678  
 678  679          /*
 679  680           * Fasttrack empty write
 680  681           */
 681  682          n = start_resid;
 682  683          if (n == 0)
 683  684                  return (0);
 684  685  
 685  686          if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 686  687                  limit = MAXOFFSET_T;
 687  688  
 688  689          ZFS_ENTER(zfsvfs);
 689  690          ZFS_VERIFY_ZP(zp);
 690  691  
 691  692          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 692  693          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 693  694          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 694  695              &zp->z_size, 8);
 695  696          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 696  697              &zp->z_pflags, 8);
 697  698  
 698  699          /*
 699  700           * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
 700  701           * callers might not be able to detect properly that we are read-only,
 701  702           * so check it explicitly here.
 702  703           */
 703  704          if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 704  705                  ZFS_EXIT(zfsvfs);
 705  706                  return (SET_ERROR(EROFS));
 706  707          }
 707  708  
 708  709          /*
 709  710           * If immutable or not appending then return EPERM
 710  711           */
 711  712          if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 712  713              ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 713  714              (uio->uio_loffset < zp->z_size))) {
 714  715                  ZFS_EXIT(zfsvfs);
 715  716                  return (SET_ERROR(EPERM));
 716  717          }
 717  718  
 718  719          zilog = zfsvfs->z_log;
 719  720  
 720  721          /*
 721  722           * Validate file offset
 722  723           */
 723  724          woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 724  725          if (woff < 0) {
 725  726                  ZFS_EXIT(zfsvfs);
 726  727                  return (SET_ERROR(EINVAL));
 727  728          }
 728  729  
 729  730          /*
 730  731           * Check for mandatory locks before calling zfs_range_lock()
 731  732           * in order to prevent a deadlock with locks set via fcntl().
 732  733           */
 733  734          if (MANDMODE((mode_t)zp->z_mode) &&
 734  735              (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 735  736                  ZFS_EXIT(zfsvfs);
 736  737                  return (error);
 737  738          }
 738  739  
 739  740          /*
 740  741           * Pre-fault the pages to ensure slow (eg NFS) pages
 741  742           * don't hold up txg.
 742  743           * Skip this if uio contains loaned arc_buf.
 743  744           */
 744  745          if ((uio->uio_extflg == UIO_XUIO) &&
 745  746              (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 746  747                  xuio = (xuio_t *)uio;
 747  748          else
 748  749                  uio_prefaultpages(MIN(n, max_blksz), uio);
 749  750  
 750  751          /*
 751  752           * If in append mode, set the io offset pointer to eof.
 752  753           */
 753  754          if (ioflag & FAPPEND) {
 754  755                  /*
 755  756                   * Obtain an appending range lock to guarantee file append
 756  757                   * semantics.  We reset the write offset once we have the lock.
 757  758                   */
 758  759                  rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 759  760                  woff = rl->r_off;
 760  761                  if (rl->r_len == UINT64_MAX) {
 761  762                          /*
 762  763                           * We overlocked the file because this write will cause
 763  764                           * the file block size to increase.
 764  765                           * Note that zp_size cannot change with this lock held.
 765  766                           */
 766  767                          woff = zp->z_size;
 767  768                  }
 768  769                  uio->uio_loffset = woff;
 769  770          } else {
 770  771                  /*
 771  772                   * Note that if the file block size will change as a result of
 772  773                   * this write, then this range lock will lock the entire file
 773  774                   * so that we can re-write the block safely.
 774  775                   */
 775  776                  rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 776  777          }
 777  778  
 778  779          if (woff >= limit) {
 779  780                  zfs_range_unlock(rl);
 780  781                  ZFS_EXIT(zfsvfs);
 781  782                  return (SET_ERROR(EFBIG));
 782  783          }
 783  784  
 784  785          if ((woff + n) > limit || woff > (limit - n))
 785  786                  n = limit - woff;
 786  787  
 787  788          /* Will this write extend the file length? */
 788  789          write_eof = (woff + n > zp->z_size);
 789  790  
 790  791          end_size = MAX(zp->z_size, woff + n);
 791  792  
 792  793          /*
 793  794           * Write the file in reasonable size chunks.  Each chunk is written
 794  795           * in a separate transaction; this keeps the intent log records small
 795  796           * and allows us to do more fine-grained space accounting.
 796  797           */
 797  798          while (n > 0) {
 798  799                  abuf = NULL;
 799  800                  woff = uio->uio_loffset;
 800  801                  if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 801  802                      zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 802  803                          if (abuf != NULL)
 803  804                                  dmu_return_arcbuf(abuf);
 804  805                          error = SET_ERROR(EDQUOT);
 805  806                          break;
 806  807                  }
 807  808  
 808  809                  if (xuio && abuf == NULL) {
 809  810                          ASSERT(i_iov < iovcnt);
 810  811                          aiov = &iovp[i_iov];
 811  812                          abuf = dmu_xuio_arcbuf(xuio, i_iov);
 812  813                          dmu_xuio_clear(xuio, i_iov);
 813  814                          DTRACE_PROBE3(zfs_cp_write, int, i_iov,
 814  815                              iovec_t *, aiov, arc_buf_t *, abuf);
 815  816                          ASSERT((aiov->iov_base == abuf->b_data) ||
 816  817                              ((char *)aiov->iov_base - (char *)abuf->b_data +
 817  818                              aiov->iov_len == arc_buf_size(abuf)));
 818  819                          i_iov++;
 819  820                  } else if (abuf == NULL && n >= max_blksz &&
 820  821                      woff >= zp->z_size &&
 821  822                      P2PHASE(woff, max_blksz) == 0 &&
 822  823                      zp->z_blksz == max_blksz) {
 823  824                          /*
 824  825                           * This write covers a full block.  "Borrow" a buffer
 825  826                           * from the dmu so that we can fill it before we enter
 826  827                           * a transaction.  This avoids the possibility of
 827  828                           * holding up the transaction if the data copy hangs
 828  829                           * up on a pagefault (e.g., from an NFS server mapping).
 829  830                           */
 830  831                          size_t cbytes;
 831  832  
 832  833                          abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 833  834                              max_blksz);
 834  835                          ASSERT(abuf != NULL);
 835  836                          ASSERT(arc_buf_size(abuf) == max_blksz);
 836  837                          if (error = uiocopy(abuf->b_data, max_blksz,
 837  838                              UIO_WRITE, uio, &cbytes)) {
 838  839                                  dmu_return_arcbuf(abuf);
 839  840                                  break;
 840  841                          }
 841  842                          ASSERT(cbytes == max_blksz);
 842  843                  }
 843  844  
 844  845                  /*
 845  846                   * Start a transaction.
 846  847                   */
 847  848                  tx = dmu_tx_create(zfsvfs->z_os);
 848  849                  dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 849  850                  dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 850  851                  zfs_sa_upgrade_txholds(tx, zp);
 851  852                  error = dmu_tx_assign(tx, TXG_WAIT);
 852  853                  if (error) {
 853  854                          dmu_tx_abort(tx);
 854  855                          if (abuf != NULL)
 855  856                                  dmu_return_arcbuf(abuf);
 856  857                          break;
 857  858                  }
 858  859  
 859  860                  /*
 860  861                   * If zfs_range_lock() over-locked we grow the blocksize
 861  862                   * and then reduce the lock range.  This will only happen
 862  863                   * on the first iteration since zfs_range_reduce() will
 863  864                   * shrink down r_len to the appropriate size.
 864  865                   */
 865  866                  if (rl->r_len == UINT64_MAX) {
 866  867                          uint64_t new_blksz;
 867  868  
 868  869                          if (zp->z_blksz > max_blksz) {
 869  870                                  /*
 870  871                                   * File's blocksize is already larger than the
 871  872                                   * "recordsize" property.  Only let it grow to
 872  873                                   * the next power of 2.
 873  874                                   */
 874  875                                  ASSERT(!ISP2(zp->z_blksz));
 875  876                                  new_blksz = MIN(end_size,
 876  877                                      1 << highbit64(zp->z_blksz));
 877  878                          } else {
 878  879                                  new_blksz = MIN(end_size, max_blksz);
 879  880                          }
 880  881                          zfs_grow_blocksize(zp, new_blksz, tx);
 881  882                          zfs_range_reduce(rl, woff, n);
 882  883                  }
 883  884  
 884  885                  /*
 885  886                   * XXX - should we really limit each write to z_max_blksz?
 886  887                   * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 887  888                   */
 888  889                  nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 889  890  
 890  891                  if (abuf == NULL) {
 891  892                          tx_bytes = uio->uio_resid;
 892  893                          error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 893  894                              uio, nbytes, tx);
 894  895                          tx_bytes -= uio->uio_resid;
 895  896                  } else {
 896  897                          tx_bytes = nbytes;
 897  898                          ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 898  899                          /*
 899  900                           * If this is not a full block write, but we are
 900  901                           * extending the file past EOF and this data starts
 901  902                           * block-aligned, use assign_arcbuf().  Otherwise,
 902  903                           * write via dmu_write().
 903  904                           */
 904  905                          if (tx_bytes < max_blksz && (!write_eof ||
 905  906                              aiov->iov_base != abuf->b_data)) {
 906  907                                  ASSERT(xuio);
 907  908                                  dmu_write(zfsvfs->z_os, zp->z_id, woff,
 908  909                                      aiov->iov_len, aiov->iov_base, tx);
 909  910                                  dmu_return_arcbuf(abuf);
 910  911                                  xuio_stat_wbuf_copied();
 911  912                          } else {
 912  913                                  ASSERT(xuio || tx_bytes == max_blksz);
 913  914                                  dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
 914  915                                      woff, abuf, tx);
 915  916                          }
 916  917                          ASSERT(tx_bytes <= uio->uio_resid);
 917  918                          uioskip(uio, tx_bytes);
 918  919                  }
 919  920                  if (tx_bytes && vn_has_cached_data(vp)) {
 920  921                          update_pages(vp, woff,
 921  922                              tx_bytes, zfsvfs->z_os, zp->z_id);
 922  923                  }
 923  924  
 924  925                  /*
 925  926                   * If we made no progress, we're done.  If we made even
 926  927                   * partial progress, update the znode and ZIL accordingly.
 927  928                   */
 928  929                  if (tx_bytes == 0) {
 929  930                          (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 930  931                              (void *)&zp->z_size, sizeof (uint64_t), tx);
 931  932                          dmu_tx_commit(tx);
 932  933                          ASSERT(error != 0);
 933  934                          break;
 934  935                  }
 935  936  
 936  937                  /*
 937  938                   * Clear Set-UID/Set-GID bits on successful write if not
 938  939                   * privileged and at least one of the excute bits is set.
 939  940                   *
 940  941                   * It would be nice to to this after all writes have
 941  942                   * been done, but that would still expose the ISUID/ISGID
 942  943                   * to another app after the partial write is committed.
 943  944                   *
 944  945                   * Note: we don't call zfs_fuid_map_id() here because
 945  946                   * user 0 is not an ephemeral uid.
 946  947                   */
 947  948                  mutex_enter(&zp->z_acl_lock);
 948  949                  if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 949  950                      (S_IXUSR >> 6))) != 0 &&
 950  951                      (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 951  952                      secpolicy_vnode_setid_retain(cr,
 952  953                      (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
 953  954                          uint64_t newmode;
 954  955                          zp->z_mode &= ~(S_ISUID | S_ISGID);
 955  956                          newmode = zp->z_mode;
 956  957                          (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 957  958                              (void *)&newmode, sizeof (uint64_t), tx);
 958  959                  }
 959  960                  mutex_exit(&zp->z_acl_lock);
 960  961  
  
    | 
      ↓ open down ↓ | 
    284 lines elided | 
    
      ↑ open up ↑ | 
  
 961  962                  zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 962  963                      B_TRUE);
 963  964  
 964  965                  /*
 965  966                   * Update the file size (zp_size) if it has changed;
 966  967                   * account for possible concurrent updates.
 967  968                   */
 968  969                  while ((end_size = zp->z_size) < uio->uio_loffset) {
 969  970                          (void) atomic_cas_64(&zp->z_size, end_size,
 970  971                              uio->uio_loffset);
 971      -                        ASSERT(error == 0);
 972  972                  }
 973  973                  /*
 974  974                   * If we are replaying and eof is non zero then force
 975  975                   * the file size to the specified eof. Note, there's no
 976  976                   * concurrency during replay.
 977  977                   */
 978  978                  if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 979  979                          zp->z_size = zfsvfs->z_replay_eof;
 980  980  
      981 +                /*
      982 +                 * Keep track of a possible pre-existing error from a partial
      983 +                 * write via dmu_write_uio_dbuf above.
      984 +                 */
      985 +                prev_error = error;
 981  986                  error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 982  987  
 983  988                  zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 984  989                  dmu_tx_commit(tx);
 985  990  
 986      -                if (error != 0)
      991 +                if (prev_error != 0 || error != 0)
 987  992                          break;
 988  993                  ASSERT(tx_bytes == nbytes);
 989  994                  n -= nbytes;
 990  995  
 991  996                  if (!xuio && n > 0)
 992  997                          uio_prefaultpages(MIN(n, max_blksz), uio);
 993  998          }
 994  999  
 995 1000          zfs_range_unlock(rl);
 996 1001  
 997 1002          /*
 998 1003           * If we're in replay mode, or we made no progress, return error.
 999 1004           * Otherwise, it's at least a partial write, so it's successful.
1000 1005           */
1001 1006          if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1002 1007                  ZFS_EXIT(zfsvfs);
1003 1008                  return (error);
1004 1009          }
1005 1010  
1006 1011          if (ioflag & (FSYNC | FDSYNC) ||
1007 1012              zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1008 1013                  zil_commit(zilog, zp->z_id);
1009 1014  
1010 1015          ZFS_EXIT(zfsvfs);
1011 1016          return (0);
1012 1017  }
1013 1018  
/*
 * Completion callback for zfs_get_data(): releases every resource the
 * get-data path acquired (dbuf hold, range lock, vnode hold, zgd itself).
 * Invoked either directly on the error paths of zfs_get_data() or
 * asynchronously by dmu_sync() when the write I/O it initiated completes.
 */
void
zfs_get_done(zgd_t *zgd, int error)
{
	znode_t *zp = zgd->zgd_private;
	objset_t *os = zp->z_zfsvfs->z_os;

	/* zgd_db is only set on the indirect-write path; immediate writes leave it NULL */
	if (zgd->zgd_db)
		dmu_buf_rele(zgd->zgd_db, zgd);

	zfs_range_unlock(zgd->zgd_rl);

	/*
	 * Release the vnode asynchronously as we currently have the
	 * txg stopped from syncing.
	 */
	VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));

	/* On success, record the block pointer in the ZIL so it is not freed early */
	if (error == 0 && zgd->zgd_bp)
		zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);

	kmem_free(zgd, sizeof (zgd_t));
}
1036 1041  
#ifdef DEBUG
/*
 * Debug fault-injection knob: when set non-zero (e.g. via mdb), the next
 * indirect-write pass through zfs_get_data() reports EIO and the flag is
 * cleared, forcing the ZIL to fall back and exercising its error path.
 */
static int zil_fault_io = 0;
#endif
1040 1045  
/*
 * Get data to generate a TX_WRITE intent log record.
 *
 * Called by the ZIL when committing a TX_WRITE record whose data was not
 * copied into the log at zfs_log_write() time.  For an immediate write
 * (buf != NULL) the file data is copied into the supplied buffer; for an
 * indirect write the block is synced to disk via dmu_sync() and the
 * resulting block pointer is stored in lr->lr_blkptr.
 *
 * Returns 0 on success; ENOENT if the file or the written range no longer
 * exists (e.g. it was removed or truncated after the write), or another
 * error from the DMU.
 */
int
zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
{
	zfsvfs_t *zfsvfs = arg;
	objset_t *os = zfsvfs->z_os;
	znode_t *zp;
	uint64_t object = lr->lr_foid;
	uint64_t offset = lr->lr_offset;
	uint64_t size = lr->lr_length;
	blkptr_t *bp = &lr->lr_blkptr;
	dmu_buf_t *db;
	zgd_t *zgd;
	int error = 0;

	ASSERT(zio != NULL);
	ASSERT(size != 0);

	/*
	 * Nothing to do if the file has been removed
	 */
	if (zfs_zget(zfsvfs, object, &zp) != 0)
		return (SET_ERROR(ENOENT));
	if (zp->z_unlinked) {
		/*
		 * Release the vnode asynchronously as we currently have the
		 * txg stopped from syncing.
		 */
		VN_RELE_ASYNC(ZTOV(zp),
		    dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
		return (SET_ERROR(ENOENT));
	}

	/* zgd carries all held resources; zfs_get_done() releases them */
	zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
	zgd->zgd_zilog = zfsvfs->z_log;
	zgd->zgd_private = zp;

	/*
	 * Write records come in two flavors: immediate and indirect.
	 * For small writes it's cheaper to store the data with the
	 * log record (immediate); for large writes it's cheaper to
	 * sync the data and get a pointer to it (indirect) so that
	 * we don't have to write the data twice.
	 */
	if (buf != NULL) { /* immediate write */
		zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
		/* test for truncation needs to be done while range locked */
		if (offset >= zp->z_size) {
			error = SET_ERROR(ENOENT);
		} else {
			error = dmu_read(os, object, offset, size, buf,
			    DMU_READ_NO_PREFETCH);
		}
		ASSERT(error == 0 || error == ENOENT);
	} else { /* indirect write */
		/*
		 * Have to lock the whole block to ensure when it's
		 * written out and it's checksum is being calculated
		 * that no one can change the data. We need to re-check
		 * blocksize after we get the lock in case it's changed!
		 */
		for (;;) {
			uint64_t blkoff;
			size = zp->z_blksz;
			/* align the lock range to the start of the block */
			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
			offset -= blkoff;
			zgd->zgd_rl = zfs_range_lock(zp, offset, size,
			    RL_READER);
			if (zp->z_blksz == size)
				break;
			/* blocksize changed while we waited; retry with new size */
			offset += blkoff;
			zfs_range_unlock(zgd->zgd_rl);
		}
		/* test for truncation needs to be done while range locked */
		if (lr->lr_offset >= zp->z_size)
			error = SET_ERROR(ENOENT);
#ifdef DEBUG
		/* one-shot fault injection; see zil_fault_io above */
		if (zil_fault_io) {
			error = SET_ERROR(EIO);
			zil_fault_io = 0;
		}
#endif
		if (error == 0)
			error = dmu_buf_hold(os, object, offset, zgd, &db,
			    DMU_READ_NO_PREFETCH);

		if (error == 0) {
			blkptr_t *obp = dmu_buf_get_blkptr(db);
			if (obp) {
				ASSERT(BP_IS_HOLE(bp));
				*bp = *obp;
			}

			zgd->zgd_db = db;
			zgd->zgd_bp = bp;

			ASSERT(db->db_offset == offset);
			ASSERT(db->db_size == size);

			error = dmu_sync(zio, lr->lr_common.lrc_txg,
			    zfs_get_done, zgd);
			ASSERT(error || lr->lr_length <= zp->z_blksz);

			/*
			 * On success, we need to wait for the write I/O
			 * initiated by dmu_sync() to complete before we can
			 * release this dbuf.  We will finish everything up
			 * in the zfs_get_done() callback.
			 */
			if (error == 0)
				return (0);

			/*
			 * EALREADY means the block is already on stable
			 * storage; log a reference (TX_WRITE2) instead of
			 * the data and report success.
			 */
			if (error == EALREADY) {
				lr->lr_common.lrc_txtype = TX_WRITE2;
				error = 0;
			}
		}
	}

	/* error paths (and immediate writes) release everything here */
	zfs_get_done(zgd, error);

	return (error);
}
1166 1171  
1167 1172  /*ARGSUSED*/
1168 1173  static int
1169 1174  zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1170 1175      caller_context_t *ct)
1171 1176  {
1172 1177          znode_t *zp = VTOZ(vp);
1173 1178          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1174 1179          int error;
1175 1180  
1176 1181          ZFS_ENTER(zfsvfs);
1177 1182          ZFS_VERIFY_ZP(zp);
1178 1183  
1179 1184          if (flag & V_ACE_MASK)
1180 1185                  error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1181 1186          else
1182 1187                  error = zfs_zaccess_rwx(zp, mode, flag, cr);
1183 1188  
1184 1189          ZFS_EXIT(zfsvfs);
1185 1190          return (error);
1186 1191  }
1187 1192  
1188 1193  /*
1189 1194   * If vnode is for a device return a specfs vnode instead.
1190 1195   */
1191 1196  static int
1192 1197  specvp_check(vnode_t **vpp, cred_t *cr)
1193 1198  {
1194 1199          int error = 0;
1195 1200  
1196 1201          if (IS_DEVVP(*vpp)) {
1197 1202                  struct vnode *svp;
1198 1203  
1199 1204                  svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1200 1205                  VN_RELE(*vpp);
1201 1206                  if (svp == NULL)
1202 1207                          error = SET_ERROR(ENOSYS);
1203 1208                  *vpp = svp;
1204 1209          }
1205 1210          return (error);
1206 1211  }
1207 1212  
1208 1213  
/*
 * Lookup an entry in a directory, or an extended attribute directory.
 * If it exists, return a held vnode reference for it.
 *
 *	IN:	dvp	- vnode of directory to search.
 *		nm	- name of entry to lookup.
 *		pnp	- full pathname to lookup [UNUSED].
 *		flags	- LOOKUP_XATTR set if looking for an attribute.
 *		rdir	- root directory vnode [UNUSED].
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		direntflags - directory lookup flags
 *		realpnp - returned pathname.
 *
 *	OUT:	vpp	- vnode of located entry, NULL if not found.
 *
 *	RETURN: 0 on success, error code on failure.
 *
 * Timestamps:
 *	NA
 */
/* ARGSUSED */
static int
zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
    int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
    int *direntflags, pathname_t *realpnp)
{
	znode_t *zdp = VTOZ(dvp);
	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
	int	error = 0;

	/*
	 * fast path: handle "", ".", and DNLC hits without taking
	 * ZFS_ENTER, relying on zfs_fastaccesschk_execute() for the
	 * execute-permission check.
	 */
	if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {

		if (dvp->v_type != VDIR) {
			return (SET_ERROR(ENOTDIR));
		} else if (zdp->z_sa_hdl == NULL) {
			/* znode has been torn down (e.g. forced unmount) */
			return (SET_ERROR(EIO));
		}

		if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
			/* "" and "." both resolve to the directory itself */
			error = zfs_fastaccesschk_execute(zdp, cr);
			if (!error) {
				*vpp = dvp;
				VN_HOLD(*vpp);
				return (0);
			}
			return (error);
		} else {
			vnode_t *tvp = dnlc_lookup(dvp, nm);

			if (tvp) {
				error = zfs_fastaccesschk_execute(zdp, cr);
				if (error) {
					VN_RELE(tvp);
					return (error);
				}
				if (tvp == DNLC_NO_VNODE) {
					/* cached negative entry */
					VN_RELE(tvp);
					return (SET_ERROR(ENOENT));
				} else {
					*vpp = tvp;
					return (specvp_check(vpp, cr));
				}
			}
		}
	}

	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zdp);

	*vpp = NULL;

	if (flags & LOOKUP_XATTR) {
		/*
		 * If the xattr property is off, refuse the lookup request.
		 */
		if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/*
		 * We don't allow recursive attributes..
		 * Maybe someday we will.
		 */
		if (zdp->z_pflags & ZFS_XATTR) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}

		/*
		 * Do we have permission to get into attribute directory?
		 */

		if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
		    B_FALSE, cr)) {
			VN_RELE(*vpp);
			*vpp = NULL;
		}

		ZFS_EXIT(zfsvfs);
		return (error);
	}

	if (dvp->v_type != VDIR) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOTDIR));
	}

	/*
	 * Check accessibility of directory.
	 */

	if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Reject names that are not valid UTF-8 on utf8only filesystems */
	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EILSEQ));
	}

	error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
	if (error == 0)
		error = specvp_check(vpp, cr);

	ZFS_EXIT(zfsvfs);
	return (error);
}
1348 1353  
1349 1354  /*
1350 1355   * Attempt to create a new entry in a directory.  If the entry
1351 1356   * already exists, truncate the file if permissible, else return
1352 1357   * an error.  Return the vp of the created or trunc'd file.
1353 1358   *
1354 1359   *      IN:     dvp     - vnode of directory to put new file entry in.
1355 1360   *              name    - name of new file entry.
1356 1361   *              vap     - attributes of new file.
1357 1362   *              excl    - flag indicating exclusive or non-exclusive mode.
1358 1363   *              mode    - mode to open file with.
1359 1364   *              cr      - credentials of caller.
1360 1365   *              flag    - large file flag [UNUSED].
1361 1366   *              ct      - caller context
1362 1367   *              vsecp   - ACL to be set
1363 1368   *
1364 1369   *      OUT:    vpp     - vnode of created or trunc'd entry.
1365 1370   *
1366 1371   *      RETURN: 0 on success, error code on failure.
1367 1372   *
1368 1373   * Timestamps:
1369 1374   *      dvp - ctime|mtime updated if new entry created
1370 1375   *       vp - ctime|mtime always, atime if new
1371 1376   */
1372 1377  
1373 1378  /* ARGSUSED */
1374 1379  static int
1375 1380  zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1376 1381      int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1377 1382      vsecattr_t *vsecp)
1378 1383  {
1379 1384          znode_t         *zp, *dzp = VTOZ(dvp);
1380 1385          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1381 1386          zilog_t         *zilog;
1382 1387          objset_t        *os;
1383 1388          zfs_dirlock_t   *dl;
1384 1389          dmu_tx_t        *tx;
1385 1390          int             error;
1386 1391          ksid_t          *ksid;
1387 1392          uid_t           uid;
1388 1393          gid_t           gid = crgetgid(cr);
1389 1394          zfs_acl_ids_t   acl_ids;
1390 1395          boolean_t       fuid_dirtied;
1391 1396          boolean_t       have_acl = B_FALSE;
1392 1397          boolean_t       waited = B_FALSE;
1393 1398  
1394 1399          /*
1395 1400           * If we have an ephemeral id, ACL, or XVATTR then
1396 1401           * make sure file system is at proper version
1397 1402           */
1398 1403  
1399 1404          ksid = crgetsid(cr, KSID_OWNER);
1400 1405          if (ksid)
1401 1406                  uid = ksid_getid(ksid);
1402 1407          else
1403 1408                  uid = crgetuid(cr);
1404 1409  
1405 1410          if (zfsvfs->z_use_fuids == B_FALSE &&
1406 1411              (vsecp || (vap->va_mask & AT_XVATTR) ||
1407 1412              IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1408 1413                  return (SET_ERROR(EINVAL));
1409 1414  
1410 1415          ZFS_ENTER(zfsvfs);
1411 1416          ZFS_VERIFY_ZP(dzp);
1412 1417          os = zfsvfs->z_os;
1413 1418          zilog = zfsvfs->z_log;
1414 1419  
                   /* Reject names that are not valid UTF-8 when the dataset requires it. */
1415 1420          if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1416 1421              NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1417 1422                  ZFS_EXIT(zfsvfs);
1418 1423                  return (SET_ERROR(EILSEQ));
1419 1424          }
1420 1425  
1421 1426          if (vap->va_mask & AT_XVATTR) {
1422 1427                  if ((error = secpolicy_xvattr((xvattr_t *)vap,
1423 1428                      crgetuid(cr), cr, vap->va_type)) != 0) {
1424 1429                          ZFS_EXIT(zfsvfs);
1425 1430                          return (error);
1426 1431                  }
1427 1432          }
           /*
            * Restart point: we come back here (with 'waited' set) if
            * dmu_tx_assign() below fails with ERESTART.
            */
1428 1433  top:
1429 1434          *vpp = NULL;
1430 1435  
                   /* Strip the sticky bit unless the caller is privileged to set it. */
1431 1436          if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1432 1437                  vap->va_mode &= ~VSVTX;
1433 1438  
1434 1439          if (*name == '\0') {
1435 1440                  /*
1436 1441                   * Null component name refers to the directory itself.
1437 1442                   */
1438 1443                  VN_HOLD(dvp);
1439 1444                  zp = dzp;
1440 1445                  dl = NULL;
1441 1446                  error = 0;
1442 1447          } else {
1443 1448                  /* possible VN_HOLD(zp) */
1444 1449                  int zflg = 0;
1445 1450  
1446 1451                  if (flag & FIGNORECASE)
1447 1452                          zflg |= ZCILOOK;
1448 1453  
1449 1454                  error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1450 1455                      NULL, NULL);
1451 1456                  if (error) {
1452 1457                          if (have_acl)
1453 1458                                  zfs_acl_ids_free(&acl_ids);
1454 1459                          if (strcmp(name, "..") == 0)
1455 1460                                  error = SET_ERROR(EISDIR);
1456 1461                          ZFS_EXIT(zfsvfs);
1457 1462                          return (error);
1458 1463                  }
1459 1464          }
1460 1465  
1461 1466          if (zp == NULL) {
1462 1467                  uint64_t txtype;
1463 1468  
1464 1469                  /*
1465 1470                   * Create a new file object and update the directory
1466 1471                   * to reference it.
1467 1472                   */
1468 1473                  if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1469 1474                          if (have_acl)
1470 1475                                  zfs_acl_ids_free(&acl_ids);
1471 1476                          goto out;
1472 1477                  }
1473 1478  
1474 1479                  /*
1475 1480                   * We only support the creation of regular files in
1476 1481                   * extended attribute directories.
1477 1482                   */
1478 1483  
1479 1484                  if ((dzp->z_pflags & ZFS_XATTR) &&
1480 1485                      (vap->va_type != VREG)) {
1481 1486                          if (have_acl)
1482 1487                                  zfs_acl_ids_free(&acl_ids);
1483 1488                          error = SET_ERROR(EINVAL);
1484 1489                          goto out;
1485 1490                  }
1486 1491  
                           /* Compute ACL/FUID ids once; reuse them across 'top' retries. */
1487 1492                  if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1488 1493                      cr, vsecp, &acl_ids)) != 0)
1489 1494                          goto out;
1490 1495                  have_acl = B_TRUE;
1491 1496  
1492 1497                  if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1493 1498                          zfs_acl_ids_free(&acl_ids);
1494 1499                          error = SET_ERROR(EDQUOT);
1495 1500                          goto out;
1496 1501                  }
1497 1502  
1498 1503                  tx = dmu_tx_create(os);
1499 1504  
1500 1505                  dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1501 1506                      ZFS_SA_BASE_ATTR_SIZE);
1502 1507  
1503 1508                  fuid_dirtied = zfsvfs->z_fuid_dirty;
1504 1509                  if (fuid_dirtied)
1505 1510                          zfs_fuid_txhold(zfsvfs, tx);
1506 1511                  dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1507 1512                  dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
                           /* An ACL too large to live in the SA needs a spill write hold. */
1508 1513                  if (!zfsvfs->z_use_sa &&
1509 1514                      acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1510 1515                          dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1511 1516                              0, acl_ids.z_aclp->z_acl_bytes);
1512 1517                  }
                           /* On the retry pass, TXG_WAITED tells the DMU we already waited. */
1513 1518                  error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1514 1519                  if (error) {
1515 1520                          zfs_dirent_unlock(dl);
1516 1521                          if (error == ERESTART) {
                                           /* Txg was full: wait for the next one and retry. */
1517 1522                                  waited = B_TRUE;
1518 1523                                  dmu_tx_wait(tx);
1519 1524                                  dmu_tx_abort(tx);
1520 1525                                  goto top;
1521 1526                          }
1522 1527                          zfs_acl_ids_free(&acl_ids);
1523 1528                          dmu_tx_abort(tx);
1524 1529                          ZFS_EXIT(zfsvfs);
1525 1530                          return (error);
1526 1531                  }
1527 1532                  zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1528 1533  
1529 1534                  if (fuid_dirtied)
1530 1535                          zfs_fuid_sync(zfsvfs, tx);
1531 1536  
1532 1537                  (void) zfs_link_create(dl, zp, tx, ZNEW);
1533 1538                  txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1534 1539                  if (flag & FIGNORECASE)
1535 1540                          txtype |= TX_CI;
1536 1541                  zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1537 1542                      vsecp, acl_ids.z_fuidp, vap);
1538 1543                  zfs_acl_ids_free(&acl_ids);
1539 1544                  dmu_tx_commit(tx);
1540 1545          } else {
1541 1546                  int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1542 1547  
1543 1548                  if (have_acl)
1544 1549                          zfs_acl_ids_free(&acl_ids);
1545 1550                  have_acl = B_FALSE;
1546 1551  
1547 1552                  /*
1548 1553                   * A directory entry already exists for this name.
1549 1554                   */
1550 1555                  /*
1551 1556                   * Can't truncate an existing file if in exclusive mode.
1552 1557                   */
1553 1558                  if (excl == EXCL) {
1554 1559                          error = SET_ERROR(EEXIST);
1555 1560                          goto out;
1556 1561                  }
1557 1562                  /*
1558 1563                   * Can't open a directory for writing.
1559 1564                   */
1560 1565                  if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1561 1566                          error = SET_ERROR(EISDIR);
1562 1567                          goto out;
1563 1568                  }
1564 1569                  /*
1565 1570                   * Verify requested access to file.
1566 1571                   */
1567 1572                  if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1568 1573                          goto out;
1569 1574                  }
1570 1575  
                           /* Bump the directory sequence number under z_lock. */
1571 1576                  mutex_enter(&dzp->z_lock);
1572 1577                  dzp->z_seq++;
1573 1578                  mutex_exit(&dzp->z_lock);
1574 1579  
1575 1580                  /*
1576 1581                   * Truncate regular files if requested.
1577 1582                   */
1578 1583                  if ((ZTOV(zp)->v_type == VREG) &&
1579 1584                      (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1580 1585                          /* we can't hold any locks when calling zfs_freesp() */
1581 1586                          zfs_dirent_unlock(dl);
1582 1587                          dl = NULL;
1583 1588                          error = zfs_freesp(zp, 0, 0, mode, TRUE);
1584 1589                          if (error == 0) {
                                           /* Emit a create vnode event for the truncating open. */
1585 1590                                  vnevent_create(ZTOV(zp), ct);
1586 1591                          }
1587 1592                  }
1588 1593          }
           /*
            * Common exit: drop the dirent lock, release zp on error, otherwise
            * hand the (possibly spec-checked) vnode back to the caller.
            */
1589 1594  out:
1590 1595  
1591 1596          if (dl)
1592 1597                  zfs_dirent_unlock(dl);
1593 1598  
1594 1599          if (error) {
1595 1600                  if (zp)
1596 1601                          VN_RELE(ZTOV(zp));
1597 1602          } else {
1598 1603                  *vpp = ZTOV(zp);
1599 1604                  error = specvp_check(vpp, cr);
1600 1605          }
1601 1606  
                   /* sync=always datasets commit the intent log before returning. */
1602 1607          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1603 1608                  zil_commit(zilog, 0);
1604 1609  
1605 1610          ZFS_EXIT(zfsvfs);
1606 1611          return (error);
1607 1612  }
1608 1613  
1609 1614  /*
1610 1615   * Remove an entry from a directory.
1611 1616   *
1612 1617   *      IN:     dvp     - vnode of directory to remove entry from.
1613 1618   *              name    - name of entry to remove.
1614 1619   *              cr      - credentials of caller.
1615 1620   *              ct      - caller context
1616 1621   *              flags   - case flags
1617 1622   *
1618 1623   *      RETURN: 0 on success, error code on failure.
1619 1624   *
1620 1625   * Timestamps:
1621 1626   *      dvp - ctime|mtime
1622 1627   *       vp - ctime (if nlink > 0)
1623 1628   */
1624 1629  
           /* All-zero SA value used below to null out a znode's xattr reference. */
1625 1630  uint64_t null_xattr = 0;
1626 1631  
1627 1632  /*ARGSUSED*/
1628 1633  static int
1629 1634  zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1630 1635      int flags)
1631 1636  {
1632 1637          znode_t         *zp, *dzp = VTOZ(dvp);
1633 1638          znode_t         *xzp;
1634 1639          vnode_t         *vp;
1635 1640          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1636 1641          zilog_t         *zilog;
1637 1642          uint64_t        acl_obj, xattr_obj;
1638 1643          uint64_t        xattr_obj_unlinked = 0;
1639 1644          uint64_t        obj = 0;
1640 1645          zfs_dirlock_t   *dl;
1641 1646          dmu_tx_t        *tx;
1642 1647          boolean_t       may_delete_now, delete_now = FALSE;
1643 1648          boolean_t       unlinked, toobig = FALSE;
1644 1649          uint64_t        txtype;
1645 1650          pathname_t      *realnmp = NULL;
1646 1651          pathname_t      realnm;
1647 1652          int             error;
1648 1653          int             zflg = ZEXISTS;
1649 1654          boolean_t       waited = B_FALSE;
1650 1655  
1651 1656          ZFS_ENTER(zfsvfs);
1652 1657          ZFS_VERIFY_ZP(dzp);
1653 1658          zilog = zfsvfs->z_log;
1654 1659  
1655 1660          if (flags & FIGNORECASE) {
1656 1661                  zflg |= ZCILOOK;
1657 1662                  pn_alloc(&realnm);
1658 1663                  realnmp = &realnm;
1659 1664          }
1660 1665  
           /*
            * Restart point: we come back here (with 'waited' set) if
            * dmu_tx_assign() below fails with ERESTART.
            */
1661 1666  top:
1662 1667          xattr_obj = 0;
1663 1668          xzp = NULL;
1664 1669          /*
1665 1670           * Attempt to lock directory; fail if entry doesn't exist.
1666 1671           */
1667 1672          if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1668 1673              NULL, realnmp)) {
1669 1674                  if (realnmp)
1670 1675                          pn_free(realnmp);
1671 1676                  ZFS_EXIT(zfsvfs);
1672 1677                  return (error);
1673 1678          }
1674 1679  
1675 1680          vp = ZTOV(zp);
1676 1681  
1677 1682          if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1678 1683                  goto out;
1679 1684          }
1680 1685  
1681 1686          /*
1682 1687           * Need to use rmdir for removing directories.
1683 1688           */
1684 1689          if (vp->v_type == VDIR) {
1685 1690                  error = SET_ERROR(EPERM);
1686 1691                  goto out;
1687 1692          }
1688 1693  
1689 1694          vnevent_remove(vp, dvp, name, ct);
1690 1695  
                   /* Evict the name (case-folded if applicable) from the DNLC. */
1691 1696          if (realnmp)
1692 1697                  dnlc_remove(dvp, realnmp->pn_buf);
1693 1698          else
1694 1699                  dnlc_remove(dvp, name);
1695 1700  
           /*
            * If ours is the only hold and there are no cached pages, we may be
            * able to delete the znode within this transaction instead of
            * deferring it to the unlinked set.
            */
1696 1701          mutex_enter(&vp->v_lock);
1697 1702          may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1698 1703          mutex_exit(&vp->v_lock);
1699 1704  
1700 1705          /*
1701 1706           * We may delete the znode now, or we may put it in the unlinked set;
1702 1707           * it depends on whether we're the last link, and on whether there are
1703 1708           * other holds on the vnode.  So we dmu_tx_hold() the right things to
1704 1709           * allow for either case.
1705 1710           */
1706 1711          obj = zp->z_id;
1707 1712          tx = dmu_tx_create(zfsvfs->z_os);
1708 1713          dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1709 1714          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1710 1715          zfs_sa_upgrade_txholds(tx, zp);
1711 1716          zfs_sa_upgrade_txholds(tx, dzp);
1712 1717          if (may_delete_now) {
1713 1718                  toobig =
1714 1719                      zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1715 1720                  /* if the file is too big, only hold_free a token amount */
1716 1721                  dmu_tx_hold_free(tx, zp->z_id, 0,
1717 1722                      (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1718 1723          }
1719 1724  
1720 1725          /* are there any extended attributes? */
1721 1726          error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1722 1727              &xattr_obj, sizeof (xattr_obj));
1723 1728          if (error == 0 && xattr_obj) {
1724 1729                  error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1725 1730                  ASSERT0(error);
1726 1731                  dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1727 1732                  dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1728 1733          }
1729 1734  
                   /* An external ACL object can be freed along with the znode. */
1730 1735          mutex_enter(&zp->z_lock);
1731 1736          if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1732 1737                  dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1733 1738          mutex_exit(&zp->z_lock);
1734 1739  
1735 1740          /* charge as an update -- would be nice not to charge at all */
1736 1741          dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1737 1742  
1738 1743          /*
1739 1744           * Mark this transaction as typically resulting in a net free of space
1740 1745           */
1741 1746          dmu_tx_mark_netfree(tx);
1742 1747  
1743 1748          error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1744 1749          if (error) {
1745 1750                  zfs_dirent_unlock(dl);
1746 1751                  VN_RELE(vp);
1747 1752                  if (xzp)
1748 1753                          VN_RELE(ZTOV(xzp));
1749 1754                  if (error == ERESTART) {
                                   /* Txg was full: wait for the next one and retry. */
1750 1755                          waited = B_TRUE;
1751 1756                          dmu_tx_wait(tx);
1752 1757                          dmu_tx_abort(tx);
1753 1758                          goto top;
1754 1759                  }
1755 1760                  if (realnmp)
1756 1761                          pn_free(realnmp);
1757 1762                  dmu_tx_abort(tx);
1758 1763                  ZFS_EXIT(zfsvfs);
1759 1764                  return (error);
1760 1765          }
1761 1766  
1762 1767          /*
1763 1768           * Remove the directory entry.
1764 1769           */
1765 1770          error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1766 1771  
1767 1772          if (error) {
1768 1773                  dmu_tx_commit(tx);
1769 1774                  goto out;
1770 1775          }
1771 1776  
1772 1777          if (unlinked) {
1773 1778                  /*
1774 1779                   * Hold z_lock so that we can make sure that the ACL obj
1775 1780                   * hasn't changed.  Could have been deleted due to
1776 1781                   * zfs_sa_upgrade().
1777 1782                   */
1778 1783                  mutex_enter(&zp->z_lock);
1779 1784                  mutex_enter(&vp->v_lock);
1780 1785                  (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1781 1786                      &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
                           /*
                            * Re-verify the conditions that made immediate deletion
                            * possible; any of them may have changed since the
                            * may_delete_now check above.
                            */
1782 1787                  delete_now = may_delete_now && !toobig &&
1783 1788                      vp->v_count == 1 && !vn_has_cached_data(vp) &&
1784 1789                      xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1785 1790                      acl_obj;
1786 1791                  mutex_exit(&vp->v_lock);
1787 1792          }
1788 1793  
1789 1794          if (delete_now) {
1790 1795                  if (xattr_obj_unlinked) {
                                   /* Drop both links (self + parent) on the xattr dir. */
1791 1796                          ASSERT3U(xzp->z_links, ==, 2);
1792 1797                          mutex_enter(&xzp->z_lock);
1793 1798                          xzp->z_unlinked = 1;
1794 1799                          xzp->z_links = 0;
1795 1800                          error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1796 1801                              &xzp->z_links, sizeof (xzp->z_links), tx);
1797 1802                          ASSERT3U(error,  ==,  0);
1798 1803                          mutex_exit(&xzp->z_lock);
1799 1804                          zfs_unlinked_add(xzp, tx);
1800 1805  
                                   /* Clear the parent's xattr reference before deleting it. */
1801 1806                          if (zp->z_is_sa)
1802 1807                                  error = sa_remove(zp->z_sa_hdl,
1803 1808                                      SA_ZPL_XATTR(zfsvfs), tx);
1804 1809                          else
1805 1810                                  error = sa_update(zp->z_sa_hdl,
1806 1811                                      SA_ZPL_XATTR(zfsvfs), &null_xattr,
1807 1812                                      sizeof (uint64_t), tx);
1808 1813                          ASSERT0(error);
1809 1814                  }
1810 1815                  mutex_enter(&vp->v_lock);
1811 1816                  vp->v_count--;
1812 1817                  ASSERT0(vp->v_count);
1813 1818                  mutex_exit(&vp->v_lock);
1814 1819                  mutex_exit(&zp->z_lock);
1815 1820                  zfs_znode_delete(zp, tx);
1816 1821          } else if (unlinked) {
                           /* Defer the actual free: park the znode on the unlinked set. */
1817 1822                  mutex_exit(&zp->z_lock);
1818 1823                  zfs_unlinked_add(zp, tx);
1819 1824          }
1820 1825  
                   /* Log the remove in the intent log. */
1821 1826          txtype = TX_REMOVE;
1822 1827          if (flags & FIGNORECASE)
1823 1828                  txtype |= TX_CI;
1824 1829          zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1825 1830  
1826 1831          dmu_tx_commit(tx);
1827 1832  out:
1828 1833          if (realnmp)
1829 1834                  pn_free(realnmp);
1830 1835  
1831 1836          zfs_dirent_unlock(dl);
1832 1837  
                   /* delete_now already consumed our hold on vp above. */
1833 1838          if (!delete_now)
1834 1839                  VN_RELE(vp);
1835 1840          if (xzp)
1836 1841                  VN_RELE(ZTOV(xzp));
1837 1842  
                   /* sync=always datasets commit the intent log before returning. */
1838 1843          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1839 1844                  zil_commit(zilog, 0);
1840 1845  
1841 1846          ZFS_EXIT(zfsvfs);
1842 1847          return (error);
1843 1848  }
1844 1849  
1845 1850  /*
1846 1851   * Create a new directory and insert it into dvp using the name
1847 1852   * provided.  Return a pointer to the inserted directory.
1848 1853   *
1849 1854   *      IN:     dvp     - vnode of directory to add subdir to.
1850 1855   *              dirname - name of new directory.
1851 1856   *              vap     - attributes of new directory.
1852 1857   *              cr      - credentials of caller.
1853 1858   *              ct      - caller context
1854 1859   *              flags   - case flags
1855 1860   *              vsecp   - ACL to be set
1856 1861   *
1857 1862   *      OUT:    vpp     - vnode of created directory.
1858 1863   *
1859 1864   *      RETURN: 0 on success, error code on failure.
1860 1865   *
1861 1866   * Timestamps:
1862 1867   *      dvp - ctime|mtime updated
1863 1868   *       vp - ctime|mtime|atime updated
1864 1869   */
1865 1870  /*ARGSUSED*/
1866 1871  static int
1867 1872  zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1868 1873      caller_context_t *ct, int flags, vsecattr_t *vsecp)
1869 1874  {
1870 1875          znode_t         *zp, *dzp = VTOZ(dvp);
1871 1876          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1872 1877          zilog_t         *zilog;
1873 1878          zfs_dirlock_t   *dl;
1874 1879          uint64_t        txtype;
1875 1880          dmu_tx_t        *tx;
1876 1881          int             error;
1877 1882          int             zf = ZNEW;
1878 1883          ksid_t          *ksid;
1879 1884          uid_t           uid;
1880 1885          gid_t           gid = crgetgid(cr);
1881 1886          zfs_acl_ids_t   acl_ids;
1882 1887          boolean_t       fuid_dirtied;
1883 1888          boolean_t       waited = B_FALSE;
1884 1889  
1885 1890          ASSERT(vap->va_type == VDIR);
1886 1891  
1887 1892          /*
1888 1893           * If we have an ephemeral id, ACL, or XVATTR then
1889 1894           * make sure file system is at proper version
1890 1895           */
1891 1896  
1892 1897          ksid = crgetsid(cr, KSID_OWNER);
1893 1898          if (ksid)
1894 1899                  uid = ksid_getid(ksid);
1895 1900          else
1896 1901                  uid = crgetuid(cr);
1897 1902          if (zfsvfs->z_use_fuids == B_FALSE &&
1898 1903              (vsecp || (vap->va_mask & AT_XVATTR) ||
1899 1904              IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1900 1905                  return (SET_ERROR(EINVAL));
1901 1906  
1902 1907          ZFS_ENTER(zfsvfs);
1903 1908          ZFS_VERIFY_ZP(dzp);
1904 1909          zilog = zfsvfs->z_log;
1905 1910  
                   /* Directories may not be created inside extended attribute dirs. */
1906 1911          if (dzp->z_pflags & ZFS_XATTR) {
1907 1912                  ZFS_EXIT(zfsvfs);
1908 1913                  return (SET_ERROR(EINVAL));
1909 1914          }
1910 1915  
                   /* Reject names that are not valid UTF-8 when the dataset requires it. */
1911 1916          if (zfsvfs->z_utf8 && u8_validate(dirname,
1912 1917              strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1913 1918                  ZFS_EXIT(zfsvfs);
1914 1919                  return (SET_ERROR(EILSEQ));
1915 1920          }
1916 1921          if (flags & FIGNORECASE)
1917 1922                  zf |= ZCILOOK;
1918 1923  
1919 1924          if (vap->va_mask & AT_XVATTR) {
1920 1925                  if ((error = secpolicy_xvattr((xvattr_t *)vap,
1921 1926                      crgetuid(cr), cr, vap->va_type)) != 0) {
1922 1927                          ZFS_EXIT(zfsvfs);
1923 1928                          return (error);
1924 1929                  }
1925 1930          }
1926 1931  
1927 1932          if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1928 1933              vsecp, &acl_ids)) != 0) {
1929 1934                  ZFS_EXIT(zfsvfs);
1930 1935                  return (error);
1931 1936          }
1932 1937          /*
1933 1938           * First make sure the new directory doesn't exist.
1934 1939           *
1935 1940           * Existence is checked first to make sure we don't return
1936 1941           * EACCES instead of EEXIST which can cause some applications
1937 1942           * to fail.
1938 1943           */
           /*
            * Restart point: we come back here (with 'waited' set) if
            * dmu_tx_assign() below fails with ERESTART.
            */
1939 1944  top:
1940 1945          *vpp = NULL;
1941 1946  
1942 1947          if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1943 1948              NULL, NULL)) {
1944 1949                  zfs_acl_ids_free(&acl_ids);
1945 1950                  ZFS_EXIT(zfsvfs);
1946 1951                  return (error);
1947 1952          }
1948 1953  
1949 1954          if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1950 1955                  zfs_acl_ids_free(&acl_ids);
1951 1956                  zfs_dirent_unlock(dl);
1952 1957                  ZFS_EXIT(zfsvfs);
1953 1958                  return (error);
1954 1959          }
1955 1960  
1956 1961          if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1957 1962                  zfs_acl_ids_free(&acl_ids);
1958 1963                  zfs_dirent_unlock(dl);
1959 1964                  ZFS_EXIT(zfsvfs);
1960 1965                  return (SET_ERROR(EDQUOT));
1961 1966          }
1962 1967  
1963 1968          /*
1964 1969           * Add a new entry to the directory.
1965 1970           */
1966 1971          tx = dmu_tx_create(zfsvfs->z_os);
1967 1972          dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1968 1973          dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1969 1974          fuid_dirtied = zfsvfs->z_fuid_dirty;
1970 1975          if (fuid_dirtied)
1971 1976                  zfs_fuid_txhold(zfsvfs, tx);
                   /* An ACL too large to live in the SA needs a spill write hold. */
1972 1977          if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1973 1978                  dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1974 1979                      acl_ids.z_aclp->z_acl_bytes);
1975 1980          }
1976 1981  
1977 1982          dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1978 1983              ZFS_SA_BASE_ATTR_SIZE);
1979 1984  
                   /* On the retry pass, TXG_WAITED tells the DMU we already waited. */
1980 1985          error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1981 1986          if (error) {
1982 1987                  zfs_dirent_unlock(dl);
1983 1988                  if (error == ERESTART) {
                                   /* Txg was full: wait for the next one and retry. */
1984 1989                          waited = B_TRUE;
1985 1990                          dmu_tx_wait(tx);
1986 1991                          dmu_tx_abort(tx);
1987 1992                          goto top;
1988 1993                  }
1989 1994                  zfs_acl_ids_free(&acl_ids);
1990 1995                  dmu_tx_abort(tx);
1991 1996                  ZFS_EXIT(zfsvfs);
1992 1997                  return (error);
1993 1998          }
1994 1999  
1995 2000          /*
1996 2001           * Create new node.
1997 2002           */
1998 2003          zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1999 2004  
2000 2005          if (fuid_dirtied)
2001 2006                  zfs_fuid_sync(zfsvfs, tx);
2002 2007  
2003 2008          /*
2004 2009           * Now put new name in parent dir.
2005 2010           */
2006 2011          (void) zfs_link_create(dl, zp, tx, ZNEW);
2007 2012  
2008 2013          *vpp = ZTOV(zp);
2009 2014  
                   /* Log the mkdir in the intent log. */
2010 2015          txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2011 2016          if (flags & FIGNORECASE)
2012 2017                  txtype |= TX_CI;
2013 2018          zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2014 2019              acl_ids.z_fuidp, vap);
2015 2020  
2016 2021          zfs_acl_ids_free(&acl_ids);
2017 2022  
2018 2023          dmu_tx_commit(tx);
2019 2024  
2020 2025          zfs_dirent_unlock(dl);
2021 2026  
                   /* sync=always datasets commit the intent log before returning. */
2022 2027          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2023 2028                  zil_commit(zilog, 0);
2024 2029  
2025 2030          ZFS_EXIT(zfsvfs);
2026 2031          return (0);
2027 2032  }
2028 2033  
2029 2034  /*
2030 2035   * Remove a directory subdir entry.  If the current working
2031 2036   * directory is the same as the subdir to be removed, the
2032 2037   * remove will fail.
2033 2038   *
2034 2039   *      IN:     dvp     - vnode of directory to remove from.
2035 2040   *              name    - name of directory to be removed.
2036 2041   *              cwd     - vnode of current working directory.
2037 2042   *              cr      - credentials of caller.
2038 2043   *              ct      - caller context
2039 2044   *              flags   - case flags
2040 2045   *
2041 2046   *      RETURN: 0 on success, error code on failure.
2042 2047   *
2043 2048   * Timestamps:
2044 2049   *      dvp - ctime|mtime updated
2045 2050   */
/*ARGSUSED*/
static int
zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
    caller_context_t *ct, int flags)
{
	znode_t		*dzp = VTOZ(dvp);
	znode_t		*zp;
	vnode_t		*vp;
	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
	zilog_t		*zilog;
	zfs_dirlock_t	*dl;
	dmu_tx_t	*tx;
	int		error;
	int		zflg = ZEXISTS;		/* entry must already exist */
	boolean_t	waited = B_FALSE;	/* set after an ERESTART retry */

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(dzp);
	zilog = zfsvfs->z_log;

	if (flags & FIGNORECASE)
		zflg |= ZCILOOK;
top:
	/* Retry point after a failed transaction assignment (ERESTART). */
	zp = NULL;

	/*
	 * Attempt to lock directory; fail if entry doesn't exist.
	 */
	if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
	    NULL, NULL)) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vp = ZTOV(zp);

	if (error = zfs_zaccess_delete(dzp, zp, cr)) {
		goto out;
	}

	if (vp->v_type != VDIR) {
		error = SET_ERROR(ENOTDIR);
		goto out;
	}

	/* POSIX: cannot remove the current working directory. */
	if (vp == cwd) {
		error = SET_ERROR(EINVAL);
		goto out;
	}

	/* Fire the vnode event before any state is torn down. */
	vnevent_rmdir(vp, dvp, name, ct);

	/*
	 * Grab a lock on the directory to make sure that noone is
	 * trying to add (or lookup) entries while we are removing it.
	 */
	rw_enter(&zp->z_name_lock, RW_WRITER);

	/*
	 * Grab a lock on the parent pointer to make sure we play well
	 * with the treewalk and directory rename code.
	 */
	rw_enter(&zp->z_parent_lock, RW_WRITER);

	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
	zfs_sa_upgrade_txholds(tx, zp);
	zfs_sa_upgrade_txholds(tx, dzp);
	/*
	 * NOTE(review): marking the tx "netfree" presumably lets this
	 * space-freeing operation proceed even when the pool is nearly
	 * full — confirm against dmu_tx_mark_netfree()'s contract.
	 */
	dmu_tx_mark_netfree(tx);
	error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (error) {
		/* Drop everything acquired since "top" before retrying. */
		rw_exit(&zp->z_parent_lock);
		rw_exit(&zp->z_name_lock);
		zfs_dirent_unlock(dl);
		VN_RELE(vp);
		if (error == ERESTART) {
			/*
			 * Wait for the next txg, then retry with
			 * TXG_WAITED so the assignment cannot be
			 * refused a second time for the same reason.
			 */
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/* Remove the directory entry; the znode moves to the unlinked set. */
	error = zfs_link_destroy(dl, zp, tx, zflg, NULL);

	if (error == 0) {
		uint64_t txtype = TX_RMDIR;
		if (flags & FIGNORECASE)
			txtype |= TX_CI;
		zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
	}

	dmu_tx_commit(tx);

	rw_exit(&zp->z_parent_lock);
	rw_exit(&zp->z_name_lock);
out:
	zfs_dirent_unlock(dl);

	VN_RELE(vp);

	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zilog, 0);

	ZFS_EXIT(zfsvfs);
	return (error);
}
2158 2163  
/*
 * Read as many directory entries as will fit into the provided
 * buffer from the given directory cursor position (specified in
 * the uio structure).
 *
 *	IN:	vp	- vnode of directory to read.
 *		uio	- structure supplying read location, range info,
 *			  and return buffer.
 *		cr	- credentials of caller.
 *		ct	- caller context
 *		flags	- case flags
 *
 *	OUT:	uio	- updated offset and range, buffer filled.
 *		eofp	- set to true if end-of-file detected.
 *
 *	RETURN: 0 on success, error code on failure.
 *
 * Timestamps:
 *	vp - atime updated
 *
 * Note that the low 4 bits of the cookie returned by zap is always zero.
 * This allows us to use the low range for "special" directory entries:
 * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
 * we use the offset 2 for the '.zfs' directory.
 */
/* ARGSUSED */
static int
zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
    caller_context_t *ct, int flags)
{
	znode_t		*zp = VTOZ(vp);
	iovec_t		*iovp;
	edirent_t	*eodp;		/* extended-format output cursor */
	dirent64_t	*odp;		/* normal-format output cursor */
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	objset_t	*os;
	caddr_t		outbuf;		/* bounce buffer, or NULL */
	size_t		bufsize;
	zap_cursor_t	zc;
	zap_attribute_t	zap;
	uint_t		bytes_wanted;
	uint64_t	offset; /* must be unsigned; checks for < 1 */
	uint64_t	parent;
	int		local_eof;
	int		outcount;
	int		error;
	uint8_t		prefetch;
	boolean_t	check_sysattrs;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Object number of the parent, needed to synthesize "..". */
	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
	    &parent, sizeof (parent))) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If we are not given an eof variable,
	 * use a local one.
	 */
	if (eofp == NULL)
		eofp = &local_eof;

	/*
	 * Check for valid iov_len.
	 */
	if (uio->uio_iov->iov_len <= 0) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/*
	 * Quit if directory has been removed (posix)
	 */
	if ((*eofp = zp->z_unlinked) != 0) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	error = 0;
	os = zfsvfs->z_os;
	offset = uio->uio_loffset;
	prefetch = zp->z_zn_prefetch;

	/*
	 * Initialize the iterator cursor.
	 */
	if (offset <= 3) {
		/*
		 * Start iteration from the beginning of the directory.
		 */
		zap_cursor_init(&zc, os, zp->z_id);
	} else {
		/*
		 * The offset is a serialized cursor.
		 */
		zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
	}

	/*
	 * Get space to change directory entries into fs independent format.
	 * If the caller did not hand us a single kernel-space iovec, build
	 * the entries in a temporary (bounce) buffer and uiomove() them out
	 * at the end; otherwise write directly into the caller's buffer.
	 */
	iovp = uio->uio_iov;
	bytes_wanted = iovp->iov_len;
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
		bufsize = bytes_wanted;
		outbuf = kmem_alloc(bufsize, KM_SLEEP);
		odp = (struct dirent64 *)outbuf;
	} else {
		bufsize = bytes_wanted;
		outbuf = NULL;
		odp = (struct dirent64 *)iovp->iov_base;
	}
	eodp = (struct edirent *)odp;

	/*
	 * If this VFS supports the system attribute view interface; and
	 * we're looking at an extended attribute directory; and we care
	 * about normalization conflicts on this vfs; then we must check
	 * for normalization conflicts with the sysattr name space.
	 */
	check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
	    (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
	    (flags & V_RDDIR_ENTFLAGS);

	/*
	 * Transform to file-system independent format
	 */
	outcount = 0;
	while (outcount < bytes_wanted) {
		ino64_t objnum;
		ushort_t reclen;
		off64_t *next = NULL;

		/*
		 * Special case `.', `..', and `.zfs'.
		 */
		if (offset == 0) {
			(void) strcpy(zap.za_name, ".");
			zap.za_normalization_conflict = 0;
			objnum = zp->z_id;
		} else if (offset == 1) {
			(void) strcpy(zap.za_name, "..");
			zap.za_normalization_conflict = 0;
			objnum = parent;
		} else if (offset == 2 && zfs_show_ctldir(zp)) {
			(void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
			zap.za_normalization_conflict = 0;
			objnum = ZFSCTL_INO_ROOT;
		} else {
			/*
			 * Grab next entry.
			 */
			if (error = zap_cursor_retrieve(&zc, &zap)) {
				/* ENOENT from the cursor means EOF. */
				if ((*eofp = (error == ENOENT)) != 0)
					break;
				else
					goto update;
			}

			/*
			 * Directory entries are expected to be a single
			 * 8-byte integer; anything else is corruption.
			 */
			if (zap.za_integer_length != 8 ||
			    zap.za_num_integers != 1) {
				cmn_err(CE_WARN, "zap_readdir: bad directory "
				    "entry, obj = %lld, offset = %lld\n",
				    (u_longlong_t)zp->z_id,
				    (u_longlong_t)offset);
				error = SET_ERROR(ENXIO);
				goto update;
			}

			objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
			/*
			 * MacOS X can extract the object type here such as:
			 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
			 */

			if (check_sysattrs && !zap.za_normalization_conflict) {
				zap.za_normalization_conflict =
				    xattr_sysattr_casechk(zap.za_name);
			}
		}

		if (flags & V_RDDIR_ACCFILTER) {
			/*
			 * If we have no access at all, don't include
			 * this entry in the returned information
			 */
			znode_t *ezp;
			if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
				goto skip_entry;
			if (!zfs_has_access(ezp, cr)) {
				VN_RELE(ZTOV(ezp));
				goto skip_entry;
			}
			VN_RELE(ZTOV(ezp));
		}

		if (flags & V_RDDIR_ENTFLAGS)
			reclen = EDIRENT_RECLEN(strlen(zap.za_name));
		else
			reclen = DIRENT64_RECLEN(strlen(zap.za_name));

		/*
		 * Will this entry fit in the buffer?
		 */
		if (outcount + reclen > bufsize) {
			/*
			 * Did we manage to fit anything in the buffer?
			 */
			if (!outcount) {
				error = SET_ERROR(EINVAL);
				goto update;
			}
			break;
		}
		if (flags & V_RDDIR_ENTFLAGS) {
			/*
			 * Add extended flag entry:
			 */
			eodp->ed_ino = objnum;
			eodp->ed_reclen = reclen;
			/* NOTE: ed_off is the offset for the *next* entry */
			next = &(eodp->ed_off);
			eodp->ed_eflags = zap.za_normalization_conflict ?
			    ED_CASE_CONFLICT : 0;
			(void) strncpy(eodp->ed_name, zap.za_name,
			    EDIRENT_NAMELEN(reclen));
			eodp = (edirent_t *)((intptr_t)eodp + reclen);
		} else {
			/*
			 * Add normal entry:
			 */
			odp->d_ino = objnum;
			odp->d_reclen = reclen;
			/* NOTE: d_off is the offset for the *next* entry */
			next = &(odp->d_off);
			(void) strncpy(odp->d_name, zap.za_name,
			    DIRENT64_NAMELEN(reclen));
			odp = (dirent64_t *)((intptr_t)odp + reclen);
		}
		outcount += reclen;

		ASSERT(outcount <= bufsize);

		/* Prefetch znode */
		if (prefetch)
			dmu_prefetch(os, objnum, 0, 0, 0,
			    ZIO_PRIORITY_SYNC_READ);

	skip_entry:
		/*
		 * Move to the next entry, fill in the previous offset.
		 * Special entries (offsets 0-2) are not cursor-backed,
		 * so just bump the offset for those.
		 */
		if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
			zap_cursor_advance(&zc);
			offset = zap_cursor_serialize(&zc);
		} else {
			offset += 1;
		}
		if (next)
			*next = offset;
	}
	zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */

	if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
		/* Direct path: entries were written in place; consume them. */
		iovp->iov_base += outcount;
		iovp->iov_len -= outcount;
		uio->uio_resid -= outcount;
	} else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
		/*
		 * Reset the pointer.
		 */
		offset = uio->uio_loffset;
	}

update:
	zap_cursor_fini(&zc);
	if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
		kmem_free(outbuf, bufsize);

	/* ENOENT here only means end-of-directory, not an error. */
	if (error == ENOENT)
		error = 0;

	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);

	uio->uio_loffset = offset;
	ZFS_EXIT(zfsvfs);
	return (error);
}
2450 2455  
/*
 * Tunable stashed in the zfs_fsyncer_key TSD on every fsync();
 * presumably consulted by the ZIL to bias commit behavior for
 * fsync-heavy threads — confirm against the zil.c consumer.
 */
ulong_t zfs_fsync_sync_cnt = 4;
2452 2457  
2453 2458  static int
2454 2459  zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2455 2460  {
2456 2461          znode_t *zp = VTOZ(vp);
2457 2462          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2458 2463  
2459 2464          /*
2460 2465           * Regardless of whether this is required for standards conformance,
2461 2466           * this is the logical behavior when fsync() is called on a file with
2462 2467           * dirty pages.  We use B_ASYNC since the ZIL transactions are already
2463 2468           * going to be pushed out as part of the zil_commit().
2464 2469           */
2465 2470          if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2466 2471              (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2467 2472                  (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2468 2473  
2469 2474          (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2470 2475  
2471 2476          if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2472 2477                  ZFS_ENTER(zfsvfs);
2473 2478                  ZFS_VERIFY_ZP(zp);
2474 2479                  zil_commit(zfsvfs->z_log, zp->z_id);
2475 2480                  ZFS_EXIT(zfsvfs);
2476 2481          }
2477 2482          return (0);
2478 2483  }
2479 2484  
2480 2485  
/*
 * Get the requested file attributes and place them in the provided
 * vattr structure.
 *
 *	IN:	vp	- vnode of file.
 *		vap	- va_mask identifies requested attributes.
 *			  If AT_XVATTR set, then optional attrs are requested
 *		flags	- ATTR_NOACLCHECK (CIFS server context)
 *		cr	- credentials of caller.
 *		ct	- caller context
 *
 *	OUT:	vap	- attribute values.
 *
 *	RETURN: 0 (always succeeds).
 */
/* ARGSUSED */
static int
zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int	error = 0;
	uint64_t links;
	uint64_t mtime[2], ctime[2];
	xvattr_t *xvap = (xvattr_t *)vap;	/* vap may be an xvattr_t * */
	xoptattr_t *xoap = NULL;
	boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
	sa_bulk_attr_t bulk[2];
	int count = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);

	/* mtime and ctime live in the SA layer; fetch both in one lookup. */
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);

	if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	/*
	 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
	 * Also, if we are the owner don't bother, since owner should
	 * always be allowed to read basic attributes of file.
	 */
	if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
	    (vap->va_uid != crgetuid(cr))) {
		if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
		    skipaclchk, cr)) {
			ZFS_EXIT(zfsvfs);
			return (error);
		}
	}

	/*
	 * Return all attributes.  It's cheaper to provide the answer
	 * than to determine whether we were asked the question.
	 */

	/* Take z_lock so the attribute snapshot below is self-consistent. */
	mutex_enter(&zp->z_lock);
	vap->va_type = vp->v_type;
	vap->va_mode = zp->z_mode & MODEMASK;
	vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
	vap->va_nodeid = zp->z_id;
	/* The hidden '.zfs' directory counts as an extra link on the root. */
	if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
		links = zp->z_links + 1;
	else
		links = zp->z_links;
	vap->va_nlink = MIN(links, UINT32_MAX);	/* nlink_t limit! */
	vap->va_size = zp->z_size;
	vap->va_rdev = vp->v_rdev;
	vap->va_seq = zp->z_seq;

	/*
	 * Add in any requested optional attributes and the create time.
	 * Also set the corresponding bits in the returned attribute bitmap.
	 * Each optional attribute is derived from a ZFS_* bit in z_pflags.
	 */
	if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
		if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
			xoap->xoa_archive =
			    ((zp->z_pflags & ZFS_ARCHIVE) != 0);
			XVA_SET_RTN(xvap, XAT_ARCHIVE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
			xoap->xoa_readonly =
			    ((zp->z_pflags & ZFS_READONLY) != 0);
			XVA_SET_RTN(xvap, XAT_READONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
			xoap->xoa_system =
			    ((zp->z_pflags & ZFS_SYSTEM) != 0);
			XVA_SET_RTN(xvap, XAT_SYSTEM);
		}

		if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
			xoap->xoa_hidden =
			    ((zp->z_pflags & ZFS_HIDDEN) != 0);
			XVA_SET_RTN(xvap, XAT_HIDDEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
			xoap->xoa_nounlink =
			    ((zp->z_pflags & ZFS_NOUNLINK) != 0);
			XVA_SET_RTN(xvap, XAT_NOUNLINK);
		}

		if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
			xoap->xoa_immutable =
			    ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
			XVA_SET_RTN(xvap, XAT_IMMUTABLE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
			xoap->xoa_appendonly =
			    ((zp->z_pflags & ZFS_APPENDONLY) != 0);
			XVA_SET_RTN(xvap, XAT_APPENDONLY);
		}

		if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
			xoap->xoa_nodump =
			    ((zp->z_pflags & ZFS_NODUMP) != 0);
			XVA_SET_RTN(xvap, XAT_NODUMP);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
			xoap->xoa_opaque =
			    ((zp->z_pflags & ZFS_OPAQUE) != 0);
			XVA_SET_RTN(xvap, XAT_OPAQUE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
			xoap->xoa_av_quarantined =
			    ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
		}

		if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
			xoap->xoa_av_modified =
			    ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
			XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
		}

		/* Scanstamps only exist for regular files. */
		if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
		    vp->v_type == VREG) {
			zfs_sa_get_scanstamp(zp, xvap);
		}

		if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
			uint64_t times[2];

			(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
			    times, sizeof (times));
			ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
			XVA_SET_RTN(xvap, XAT_CREATETIME);
		}

		if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
			xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_REPARSE);
		}
		if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
			xoap->xoa_generation = zp->z_gen;
			XVA_SET_RTN(xvap, XAT_GEN);
		}

		if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
			xoap->xoa_offline =
			    ((zp->z_pflags & ZFS_OFFLINE) != 0);
			XVA_SET_RTN(xvap, XAT_OFFLINE);
		}

		if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
			xoap->xoa_sparse =
			    ((zp->z_pflags & ZFS_SPARSE) != 0);
			XVA_SET_RTN(xvap, XAT_SPARSE);
		}
	}

	ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
	ZFS_TIME_DECODE(&vap->va_mtime, mtime);
	ZFS_TIME_DECODE(&vap->va_ctime, ctime);

	mutex_exit(&zp->z_lock);

	sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);

	if (zp->z_blksz == 0) {
		/*
		 * Block size hasn't been set; suggest maximal I/O transfers.
		 */
		vap->va_blksize = zfsvfs->z_max_blksz;
	}

	ZFS_EXIT(zfsvfs);
	return (0);
}
2683 2688  
2684 2689  /*
2685 2690   * Set the file attributes to the values contained in the
2686 2691   * vattr structure.
2687 2692   *
2688 2693   *      IN:     vp      - vnode of file to be modified.
2689 2694   *              vap     - new attribute values.
2690 2695   *                        If AT_XVATTR set, then optional attrs are being set
2691 2696   *              flags   - ATTR_UTIME set if non-default time values provided.
2692 2697   *                      - ATTR_NOACLCHECK (CIFS context only).
2693 2698   *              cr      - credentials of caller.
2694 2699   *              ct      - caller context
2695 2700   *
2696 2701   *      RETURN: 0 on success, error code on failure.
2697 2702   *
2698 2703   * Timestamps:
2699 2704   *      vp - ctime updated, mtime updated if size changed.
2700 2705   */
2701 2706  /* ARGSUSED */
2702 2707  static int
2703 2708  zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2704 2709      caller_context_t *ct)
2705 2710  {
2706 2711          znode_t         *zp = VTOZ(vp);
2707 2712          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2708 2713          zilog_t         *zilog;
2709 2714          dmu_tx_t        *tx;
2710 2715          vattr_t         oldva;
2711 2716          xvattr_t        tmpxvattr;
2712 2717          uint_t          mask = vap->va_mask;
2713 2718          uint_t          saved_mask = 0;
2714 2719          int             trim_mask = 0;
2715 2720          uint64_t        new_mode;
2716 2721          uint64_t        new_uid, new_gid;
2717 2722          uint64_t        xattr_obj;
2718 2723          uint64_t        mtime[2], ctime[2];
2719 2724          znode_t         *attrzp;
2720 2725          int             need_policy = FALSE;
2721 2726          int             err, err2;
2722 2727          zfs_fuid_info_t *fuidp = NULL;
2723 2728          xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2724 2729          xoptattr_t      *xoap;
2725 2730          zfs_acl_t       *aclp;
2726 2731          boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2727 2732          boolean_t       fuid_dirtied = B_FALSE;
2728 2733          sa_bulk_attr_t  bulk[7], xattr_bulk[7];
2729 2734          int             count = 0, xattr_count = 0;
2730 2735  
2731 2736          if (mask == 0)
2732 2737                  return (0);
2733 2738  
2734 2739          if (mask & AT_NOSET)
2735 2740                  return (SET_ERROR(EINVAL));
2736 2741  
2737 2742          ZFS_ENTER(zfsvfs);
2738 2743          ZFS_VERIFY_ZP(zp);
2739 2744  
2740 2745          zilog = zfsvfs->z_log;
2741 2746  
2742 2747          /*
2743 2748           * Make sure that if we have ephemeral uid/gid or xvattr specified
2744 2749           * that file system is at proper version level
2745 2750           */
2746 2751  
2747 2752          if (zfsvfs->z_use_fuids == B_FALSE &&
2748 2753              (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2749 2754              ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2750 2755              (mask & AT_XVATTR))) {
2751 2756                  ZFS_EXIT(zfsvfs);
2752 2757                  return (SET_ERROR(EINVAL));
2753 2758          }
2754 2759  
2755 2760          if (mask & AT_SIZE && vp->v_type == VDIR) {
2756 2761                  ZFS_EXIT(zfsvfs);
2757 2762                  return (SET_ERROR(EISDIR));
2758 2763          }
2759 2764  
2760 2765          if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2761 2766                  ZFS_EXIT(zfsvfs);
2762 2767                  return (SET_ERROR(EINVAL));
2763 2768          }
2764 2769  
2765 2770          /*
2766 2771           * If this is an xvattr_t, then get a pointer to the structure of
2767 2772           * optional attributes.  If this is NULL, then we have a vattr_t.
2768 2773           */
2769 2774          xoap = xva_getxoptattr(xvap);
2770 2775  
2771 2776          xva_init(&tmpxvattr);
2772 2777  
2773 2778          /*
2774 2779           * Immutable files can only alter immutable bit and atime
2775 2780           */
2776 2781          if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2777 2782              ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2778 2783              ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2779 2784                  ZFS_EXIT(zfsvfs);
2780 2785                  return (SET_ERROR(EPERM));
2781 2786          }
2782 2787  
2783 2788          if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2784 2789                  ZFS_EXIT(zfsvfs);
2785 2790                  return (SET_ERROR(EPERM));
2786 2791          }
2787 2792  
2788 2793          /*
2789 2794           * Verify that timestamps don't overflow 32 bits.
2790 2795           * ZFS can handle large timestamps, but 32bit syscalls can't
2791 2796           * handle times greater than 2039.  This check should be removed
2792 2797           * once large timestamps are fully supported.
2793 2798           */
2794 2799          if (mask & (AT_ATIME | AT_MTIME)) {
2795 2800                  if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2796 2801                      ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2797 2802                          ZFS_EXIT(zfsvfs);
2798 2803                          return (SET_ERROR(EOVERFLOW));
2799 2804                  }
2800 2805          }
2801 2806  
2802 2807  top:
2803 2808          attrzp = NULL;
2804 2809          aclp = NULL;
2805 2810  
2806 2811          /* Can this be moved to before the top label? */
2807 2812          if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2808 2813                  ZFS_EXIT(zfsvfs);
2809 2814                  return (SET_ERROR(EROFS));
2810 2815          }
2811 2816  
2812 2817          /*
2813 2818           * First validate permissions
2814 2819           */
2815 2820  
2816 2821          if (mask & AT_SIZE) {
2817 2822                  err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2818 2823                  if (err) {
2819 2824                          ZFS_EXIT(zfsvfs);
2820 2825                          return (err);
2821 2826                  }
2822 2827                  /*
2823 2828                   * XXX - Note, we are not providing any open
2824 2829                   * mode flags here (like FNDELAY), so we may
  
    | 
      ↓ open down ↓ | 
    1828 lines elided | 
    
      ↑ open up ↑ | 
  
2825 2830                   * block if there are locks present... this
2826 2831                   * should be addressed in openat().
2827 2832                   */
2828 2833                  /* XXX - would it be OK to generate a log record here? */
2829 2834                  err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2830 2835                  if (err) {
2831 2836                          ZFS_EXIT(zfsvfs);
2832 2837                          return (err);
2833 2838                  }
2834 2839  
2835      -                if (vap->va_size == 0)
     2840 +                if (vap->va_size == 0) {
2836 2841                          vnevent_truncate(ZTOV(zp), ct);
     2842 +                } else {
     2843 +                        vnevent_resize(ZTOV(zp), ct);
     2844 +                }
2837 2845          }
2838 2846  
2839 2847          if (mask & (AT_ATIME|AT_MTIME) ||
2840 2848              ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2841 2849              XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2842 2850              XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2843 2851              XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2844 2852              XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2845 2853              XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2846 2854              XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2847 2855                  need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2848 2856                      skipaclchk, cr);
2849 2857          }
2850 2858  
2851 2859          if (mask & (AT_UID|AT_GID)) {
2852 2860                  int     idmask = (mask & (AT_UID|AT_GID));
2853 2861                  int     take_owner;
2854 2862                  int     take_group;
2855 2863  
2856 2864                  /*
2857 2865                   * NOTE: even if a new mode is being set,
2858 2866                   * we may clear S_ISUID/S_ISGID bits.
2859 2867                   */
2860 2868  
2861 2869                  if (!(mask & AT_MODE))
2862 2870                          vap->va_mode = zp->z_mode;
2863 2871  
2864 2872                  /*
2865 2873                   * Take ownership or chgrp to group we are a member of
2866 2874                   */
2867 2875  
2868 2876                  take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2869 2877                  take_group = (mask & AT_GID) &&
2870 2878                      zfs_groupmember(zfsvfs, vap->va_gid, cr);
2871 2879  
2872 2880                  /*
2873 2881                   * If both AT_UID and AT_GID are set then take_owner and
2874 2882                   * take_group must both be set in order to allow taking
2875 2883                   * ownership.
2876 2884                   *
2877 2885                   * Otherwise, send the check through secpolicy_vnode_setattr()
2878 2886                   *
2879 2887                   */
2880 2888  
2881 2889                  if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2882 2890                      ((idmask == AT_UID) && take_owner) ||
2883 2891                      ((idmask == AT_GID) && take_group)) {
2884 2892                          if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2885 2893                              skipaclchk, cr) == 0) {
2886 2894                                  /*
2887 2895                                   * Remove setuid/setgid for non-privileged users
2888 2896                                   */
2889 2897                                  secpolicy_setid_clear(vap, cr);
2890 2898                                  trim_mask = (mask & (AT_UID|AT_GID));
2891 2899                          } else {
2892 2900                                  need_policy =  TRUE;
2893 2901                          }
2894 2902                  } else {
2895 2903                          need_policy =  TRUE;
2896 2904                  }
2897 2905          }
2898 2906  
2899 2907          mutex_enter(&zp->z_lock);
2900 2908          oldva.va_mode = zp->z_mode;
2901 2909          zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2902 2910          if (mask & AT_XVATTR) {
2903 2911                  /*
2904 2912                   * Update xvattr mask to include only those attributes
2905 2913                   * that are actually changing.
2906 2914                   *
2907 2915                   * the bits will be restored prior to actually setting
2908 2916                   * the attributes so the caller thinks they were set.
2909 2917                   */
2910 2918                  if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2911 2919                          if (xoap->xoa_appendonly !=
2912 2920                              ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2913 2921                                  need_policy = TRUE;
2914 2922                          } else {
2915 2923                                  XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2916 2924                                  XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2917 2925                          }
2918 2926                  }
2919 2927  
2920 2928                  if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2921 2929                          if (xoap->xoa_nounlink !=
2922 2930                              ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2923 2931                                  need_policy = TRUE;
2924 2932                          } else {
2925 2933                                  XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2926 2934                                  XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2927 2935                          }
2928 2936                  }
2929 2937  
2930 2938                  if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2931 2939                          if (xoap->xoa_immutable !=
2932 2940                              ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2933 2941                                  need_policy = TRUE;
2934 2942                          } else {
2935 2943                                  XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2936 2944                                  XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2937 2945                          }
2938 2946                  }
2939 2947  
2940 2948                  if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2941 2949                          if (xoap->xoa_nodump !=
2942 2950                              ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2943 2951                                  need_policy = TRUE;
2944 2952                          } else {
2945 2953                                  XVA_CLR_REQ(xvap, XAT_NODUMP);
2946 2954                                  XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2947 2955                          }
2948 2956                  }
2949 2957  
2950 2958                  if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2951 2959                          if (xoap->xoa_av_modified !=
2952 2960                              ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2953 2961                                  need_policy = TRUE;
2954 2962                          } else {
2955 2963                                  XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2956 2964                                  XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2957 2965                          }
2958 2966                  }
2959 2967  
2960 2968                  if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2961 2969                          if ((vp->v_type != VREG &&
2962 2970                              xoap->xoa_av_quarantined) ||
2963 2971                              xoap->xoa_av_quarantined !=
2964 2972                              ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2965 2973                                  need_policy = TRUE;
2966 2974                          } else {
2967 2975                                  XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2968 2976                                  XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2969 2977                          }
2970 2978                  }
2971 2979  
2972 2980                  if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2973 2981                          mutex_exit(&zp->z_lock);
2974 2982                          ZFS_EXIT(zfsvfs);
2975 2983                          return (SET_ERROR(EPERM));
2976 2984                  }
2977 2985  
2978 2986                  if (need_policy == FALSE &&
2979 2987                      (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2980 2988                      XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2981 2989                          need_policy = TRUE;
2982 2990                  }
2983 2991          }
2984 2992  
2985 2993          mutex_exit(&zp->z_lock);
2986 2994  
2987 2995          if (mask & AT_MODE) {
2988 2996                  if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2989 2997                          err = secpolicy_setid_setsticky_clear(vp, vap,
2990 2998                              &oldva, cr);
2991 2999                          if (err) {
2992 3000                                  ZFS_EXIT(zfsvfs);
2993 3001                                  return (err);
2994 3002                          }
2995 3003                          trim_mask |= AT_MODE;
2996 3004                  } else {
2997 3005                          need_policy = TRUE;
2998 3006                  }
2999 3007          }
3000 3008  
3001 3009          if (need_policy) {
3002 3010                  /*
3003 3011                   * If trim_mask is set then take ownership
3004 3012                   * has been granted or write_acl is present and user
3005 3013                   * has the ability to modify mode.  In that case remove
3006 3014                   * UID|GID and or MODE from mask so that
3007 3015                   * secpolicy_vnode_setattr() doesn't revoke it.
3008 3016                   */
3009 3017  
3010 3018                  if (trim_mask) {
3011 3019                          saved_mask = vap->va_mask;
3012 3020                          vap->va_mask &= ~trim_mask;
3013 3021                  }
3014 3022                  err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3015 3023                      (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3016 3024                  if (err) {
3017 3025                          ZFS_EXIT(zfsvfs);
3018 3026                          return (err);
3019 3027                  }
3020 3028  
3021 3029                  if (trim_mask)
3022 3030                          vap->va_mask |= saved_mask;
3023 3031          }
3024 3032  
3025 3033          /*
3026 3034           * secpolicy_vnode_setattr, or take ownership may have
3027 3035           * changed va_mask
3028 3036           */
3029 3037          mask = vap->va_mask;
3030 3038  
3031 3039          if ((mask & (AT_UID | AT_GID))) {
3032 3040                  err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3033 3041                      &xattr_obj, sizeof (xattr_obj));
3034 3042  
3035 3043                  if (err == 0 && xattr_obj) {
3036 3044                          err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3037 3045                          if (err)
3038 3046                                  goto out2;
3039 3047                  }
3040 3048                  if (mask & AT_UID) {
3041 3049                          new_uid = zfs_fuid_create(zfsvfs,
3042 3050                              (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3043 3051                          if (new_uid != zp->z_uid &&
3044 3052                              zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3045 3053                                  if (attrzp)
3046 3054                                          VN_RELE(ZTOV(attrzp));
3047 3055                                  err = SET_ERROR(EDQUOT);
3048 3056                                  goto out2;
3049 3057                          }
3050 3058                  }
3051 3059  
3052 3060                  if (mask & AT_GID) {
3053 3061                          new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3054 3062                              cr, ZFS_GROUP, &fuidp);
3055 3063                          if (new_gid != zp->z_gid &&
3056 3064                              zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3057 3065                                  if (attrzp)
3058 3066                                          VN_RELE(ZTOV(attrzp));
3059 3067                                  err = SET_ERROR(EDQUOT);
3060 3068                                  goto out2;
3061 3069                          }
3062 3070                  }
3063 3071          }
3064 3072          tx = dmu_tx_create(zfsvfs->z_os);
3065 3073  
3066 3074          if (mask & AT_MODE) {
3067 3075                  uint64_t pmode = zp->z_mode;
3068 3076                  uint64_t acl_obj;
3069 3077                  new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3070 3078  
3071 3079                  if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3072 3080                      !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3073 3081                          err = SET_ERROR(EPERM);
3074 3082                          goto out;
3075 3083                  }
3076 3084  
3077 3085                  if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3078 3086                          goto out;
3079 3087  
3080 3088                  mutex_enter(&zp->z_lock);
3081 3089                  if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3082 3090                          /*
3083 3091                           * Are we upgrading ACL from old V0 format
3084 3092                           * to V1 format?
3085 3093                           */
3086 3094                          if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3087 3095                              zfs_znode_acl_version(zp) ==
3088 3096                              ZFS_ACL_VERSION_INITIAL) {
3089 3097                                  dmu_tx_hold_free(tx, acl_obj, 0,
3090 3098                                      DMU_OBJECT_END);
3091 3099                                  dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3092 3100                                      0, aclp->z_acl_bytes);
3093 3101                          } else {
3094 3102                                  dmu_tx_hold_write(tx, acl_obj, 0,
3095 3103                                      aclp->z_acl_bytes);
3096 3104                          }
3097 3105                  } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3098 3106                          dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3099 3107                              0, aclp->z_acl_bytes);
3100 3108                  }
3101 3109                  mutex_exit(&zp->z_lock);
3102 3110                  dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3103 3111          } else {
3104 3112                  if ((mask & AT_XVATTR) &&
3105 3113                      XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3106 3114                          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3107 3115                  else
3108 3116                          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3109 3117          }
3110 3118  
3111 3119          if (attrzp) {
3112 3120                  dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3113 3121          }
3114 3122  
3115 3123          fuid_dirtied = zfsvfs->z_fuid_dirty;
3116 3124          if (fuid_dirtied)
3117 3125                  zfs_fuid_txhold(zfsvfs, tx);
3118 3126  
3119 3127          zfs_sa_upgrade_txholds(tx, zp);
3120 3128  
3121 3129          err = dmu_tx_assign(tx, TXG_WAIT);
3122 3130          if (err)
3123 3131                  goto out;
3124 3132  
3125 3133          count = 0;
3126 3134          /*
3127 3135           * Set each attribute requested.
3128 3136           * We group settings according to the locks they need to acquire.
3129 3137           *
3130 3138           * Note: you cannot set ctime directly, although it will be
3131 3139           * updated as a side-effect of calling this function.
3132 3140           */
3133 3141  
3134 3142  
3135 3143          if (mask & (AT_UID|AT_GID|AT_MODE))
3136 3144                  mutex_enter(&zp->z_acl_lock);
3137 3145          mutex_enter(&zp->z_lock);
3138 3146  
3139 3147          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3140 3148              &zp->z_pflags, sizeof (zp->z_pflags));
3141 3149  
3142 3150          if (attrzp) {
3143 3151                  if (mask & (AT_UID|AT_GID|AT_MODE))
3144 3152                          mutex_enter(&attrzp->z_acl_lock);
3145 3153                  mutex_enter(&attrzp->z_lock);
3146 3154                  SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3147 3155                      SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3148 3156                      sizeof (attrzp->z_pflags));
3149 3157          }
3150 3158  
3151 3159          if (mask & (AT_UID|AT_GID)) {
3152 3160  
3153 3161                  if (mask & AT_UID) {
3154 3162                          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3155 3163                              &new_uid, sizeof (new_uid));
3156 3164                          zp->z_uid = new_uid;
3157 3165                          if (attrzp) {
3158 3166                                  SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3159 3167                                      SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3160 3168                                      sizeof (new_uid));
3161 3169                                  attrzp->z_uid = new_uid;
3162 3170                          }
3163 3171                  }
3164 3172  
3165 3173                  if (mask & AT_GID) {
3166 3174                          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3167 3175                              NULL, &new_gid, sizeof (new_gid));
3168 3176                          zp->z_gid = new_gid;
3169 3177                          if (attrzp) {
3170 3178                                  SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3171 3179                                      SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3172 3180                                      sizeof (new_gid));
3173 3181                                  attrzp->z_gid = new_gid;
3174 3182                          }
3175 3183                  }
3176 3184                  if (!(mask & AT_MODE)) {
3177 3185                          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3178 3186                              NULL, &new_mode, sizeof (new_mode));
3179 3187                          new_mode = zp->z_mode;
3180 3188                  }
3181 3189                  err = zfs_acl_chown_setattr(zp);
3182 3190                  ASSERT(err == 0);
3183 3191                  if (attrzp) {
3184 3192                          err = zfs_acl_chown_setattr(attrzp);
3185 3193                          ASSERT(err == 0);
3186 3194                  }
3187 3195          }
3188 3196  
3189 3197          if (mask & AT_MODE) {
3190 3198                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3191 3199                      &new_mode, sizeof (new_mode));
3192 3200                  zp->z_mode = new_mode;
3193 3201                  ASSERT3U((uintptr_t)aclp, !=, NULL);
3194 3202                  err = zfs_aclset_common(zp, aclp, cr, tx);
3195 3203                  ASSERT0(err);
3196 3204                  if (zp->z_acl_cached)
3197 3205                          zfs_acl_free(zp->z_acl_cached);
3198 3206                  zp->z_acl_cached = aclp;
3199 3207                  aclp = NULL;
3200 3208          }
3201 3209  
3202 3210  
3203 3211          if (mask & AT_ATIME) {
3204 3212                  ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3205 3213                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3206 3214                      &zp->z_atime, sizeof (zp->z_atime));
3207 3215          }
3208 3216  
3209 3217          if (mask & AT_MTIME) {
3210 3218                  ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3211 3219                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3212 3220                      mtime, sizeof (mtime));
3213 3221          }
3214 3222  
3215 3223          /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3216 3224          if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3217 3225                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3218 3226                      NULL, mtime, sizeof (mtime));
3219 3227                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3220 3228                      &ctime, sizeof (ctime));
3221 3229                  zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3222 3230                      B_TRUE);
3223 3231          } else if (mask != 0) {
3224 3232                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3225 3233                      &ctime, sizeof (ctime));
3226 3234                  zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3227 3235                      B_TRUE);
3228 3236                  if (attrzp) {
3229 3237                          SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3230 3238                              SA_ZPL_CTIME(zfsvfs), NULL,
3231 3239                              &ctime, sizeof (ctime));
3232 3240                          zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3233 3241                              mtime, ctime, B_TRUE);
3234 3242                  }
3235 3243          }
3236 3244          /*
3237 3245           * Do this after setting timestamps to prevent timestamp
3238 3246           * update from toggling bit
3239 3247           */
3240 3248  
3241 3249          if (xoap && (mask & AT_XVATTR)) {
3242 3250  
3243 3251                  /*
3244 3252                   * restore trimmed off masks
3245 3253                   * so that return masks can be set for caller.
3246 3254                   */
3247 3255  
3248 3256                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3249 3257                          XVA_SET_REQ(xvap, XAT_APPENDONLY);
3250 3258                  }
3251 3259                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3252 3260                          XVA_SET_REQ(xvap, XAT_NOUNLINK);
3253 3261                  }
3254 3262                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3255 3263                          XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3256 3264                  }
3257 3265                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3258 3266                          XVA_SET_REQ(xvap, XAT_NODUMP);
3259 3267                  }
3260 3268                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3261 3269                          XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3262 3270                  }
3263 3271                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3264 3272                          XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3265 3273                  }
3266 3274  
3267 3275                  if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3268 3276                          ASSERT(vp->v_type == VREG);
3269 3277  
3270 3278                  zfs_xvattr_set(zp, xvap, tx);
3271 3279          }
3272 3280  
3273 3281          if (fuid_dirtied)
3274 3282                  zfs_fuid_sync(zfsvfs, tx);
3275 3283  
3276 3284          if (mask != 0)
3277 3285                  zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3278 3286  
3279 3287          mutex_exit(&zp->z_lock);
3280 3288          if (mask & (AT_UID|AT_GID|AT_MODE))
3281 3289                  mutex_exit(&zp->z_acl_lock);
3282 3290  
3283 3291          if (attrzp) {
3284 3292                  if (mask & (AT_UID|AT_GID|AT_MODE))
3285 3293                          mutex_exit(&attrzp->z_acl_lock);
3286 3294                  mutex_exit(&attrzp->z_lock);
3287 3295          }
3288 3296  out:
3289 3297          if (err == 0 && attrzp) {
3290 3298                  err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3291 3299                      xattr_count, tx);
3292 3300                  ASSERT(err2 == 0);
3293 3301          }
3294 3302  
3295 3303          if (attrzp)
3296 3304                  VN_RELE(ZTOV(attrzp));
3297 3305  
3298 3306          if (aclp)
3299 3307                  zfs_acl_free(aclp);
3300 3308  
3301 3309          if (fuidp) {
3302 3310                  zfs_fuid_info_free(fuidp);
3303 3311                  fuidp = NULL;
3304 3312          }
3305 3313  
3306 3314          if (err) {
3307 3315                  dmu_tx_abort(tx);
3308 3316                  if (err == ERESTART)
3309 3317                          goto top;
3310 3318          } else {
3311 3319                  err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3312 3320                  dmu_tx_commit(tx);
3313 3321          }
3314 3322  
3315 3323  out2:
3316 3324          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3317 3325                  zil_commit(zilog, 0);
3318 3326  
3319 3327          ZFS_EXIT(zfsvfs);
3320 3328          return (err);
3321 3329  }
3322 3330  
/*
 * One entry in the list of locks taken by zfs_rename_lock().  The list
 * head is threaded through zl_next so zfs_rename_unlock() can release
 * every rwlock (and any znode hold) acquired during the ancestry walk.
 */
typedef struct zfs_zlock {
	krwlock_t	*zl_rwlock;	/* lock we acquired */
	znode_t		*zl_znode;	/* znode we held */
	struct zfs_zlock *zl_next;	/* next in list */
} zfs_zlock_t;
3328 3336  
3329 3337  /*
3330 3338   * Drop locks and release vnodes that were held by zfs_rename_lock().
3331 3339   */
3332 3340  static void
3333 3341  zfs_rename_unlock(zfs_zlock_t **zlpp)
3334 3342  {
3335 3343          zfs_zlock_t *zl;
3336 3344  
3337 3345          while ((zl = *zlpp) != NULL) {
3338 3346                  if (zl->zl_znode != NULL)
3339 3347                          VN_RELE(ZTOV(zl->zl_znode));
3340 3348                  rw_exit(zl->zl_rwlock);
3341 3349                  *zlpp = zl->zl_next;
3342 3350                  kmem_free(zl, sizeof (*zl));
3343 3351          }
3344 3352  }
3345 3353  
/*
 * Search back through the directory tree, using the ".." entries.
 * Lock each directory in the chain to prevent concurrent renames.
 * Fail any attempt to move a directory into one of its own descendants.
 * XXX - z_parent_lock can overlap with map or grow locks
 *
 *	IN:	szp	- znode being renamed.
 *		tdzp	- target directory; the ancestry walk starts here.
 *		sdzp	- source directory; the walk stops on reaching it.
 *		zlpp	- head of the list of acquired locks; caller must
 *			  call zfs_rename_unlock() on it regardless of the
 *			  return value.
 *
 *	RETURN:	0 on success (all locks held), EINVAL if tdzp is a
 *		descendant of szp, or an error from zfs_zget().
 */
static int
zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
{
	zfs_zlock_t	*zl;
	znode_t		*zp = tdzp;
	uint64_t	rootid = zp->z_zfsvfs->z_root;
	uint64_t	oidp = zp->z_id;
	krwlock_t	*rwlp = &szp->z_parent_lock;
	krw_t		rw = RW_WRITER;

	/*
	 * First pass write-locks szp and compares to zp->z_id.
	 * Later passes read-lock zp and compare to zp->z_parent.
	 */
	do {
		if (!rw_tryenter(rwlp, rw)) {
			/*
			 * Another thread is renaming in this path.
			 * Note that if we are a WRITER, we don't have any
			 * parent_locks held yet.
			 */
			if (rw == RW_READER && zp->z_id > szp->z_id) {
				/*
				 * Drop our locks and restart from the top.
				 * NOTE(review): the z_id comparison appears
				 * to impose an ordering between the two
				 * racing renames — confirm against the
				 * locking notes for z_parent_lock.
				 */
				zfs_rename_unlock(&zl);
				*zlpp = NULL;
				zp = tdzp;
				oidp = zp->z_id;
				rwlp = &szp->z_parent_lock;
				rw = RW_WRITER;
				continue;
			} else {
				/*
				 * Wait for other thread to drop its locks
				 */
				rw_enter(rwlp, rw);
			}
		}

		/* Record this acquisition so zfs_rename_unlock() undoes it. */
		zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
		zl->zl_rwlock = rwlp;
		zl->zl_znode = NULL;
		zl->zl_next = *zlpp;
		*zlpp = zl;

		if (oidp == szp->z_id)		/* We're a descendant of szp */
			return (SET_ERROR(EINVAL));

		if (oidp == rootid)		/* We've hit the top */
			return (0);

		if (rw == RW_READER) {		/* i.e. not the first pass */
			int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
			if (error)
				return (error);
			/* Hold the znode so the unlock path can VN_RELE it. */
			zl->zl_znode = zp;
		}
		/* Step up to the parent directory via the SA parent attr. */
		(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
		    &oidp, sizeof (oidp));
		rwlp = &zp->z_parent_lock;
		rw = RW_READER;

	} while (zp->z_id != sdzp->z_id);

	return (0);
}
3419 3427  
/*
 * Move an entry from the provided source directory to the target
 * directory.  Change the entry name as indicated.
 *
 *      IN:     sdvp    - Source directory containing the "old entry".
 *              snm     - Old entry name.
 *              tdvp    - Target directory to contain the "new entry".
 *              tnm     - New entry name.
 *              cr      - credentials of caller.
 *              ct      - caller context
 *              flags   - case flags
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * Timestamps:
 *      sdvp,tdvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
    caller_context_t *ct, int flags)
{
        znode_t         *tdzp, *szp, *tzp;
        znode_t         *sdzp = VTOZ(sdvp);
        zfsvfs_t        *zfsvfs = sdzp->z_zfsvfs;
        zilog_t         *zilog;
        vnode_t         *realvp;
        zfs_dirlock_t   *sdl, *tdl;
        dmu_tx_t        *tx;
        zfs_zlock_t     *zl;
        int             cmp, serr, terr;
        int             error = 0, rm_err = 0;
        int             zflg = 0;
        boolean_t       waited = B_FALSE;

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(sdzp);
        zilog = zfsvfs->z_log;

        /*
         * Make sure we have the real vp for the target directory.
         */
        if (VOP_REALVP(tdvp, &realvp, ct) == 0)
                tdvp = realvp;

        tdzp = VTOZ(tdvp);
        ZFS_VERIFY_ZP(tdzp);

        /*
         * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
         * ctldir appear to have the same v_vfsp.
         */
        if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EXDEV));
        }

        /* Reject names that are not valid UTF-8 when the fs demands it. */
        if (zfsvfs->z_utf8 && u8_validate(tnm,
            strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EILSEQ));
        }

        if (flags & FIGNORECASE)
                zflg |= ZCILOOK;

top:
        /* Re-entered after a dmu_tx_assign() restart; reset per-pass state. */
        szp = NULL;
        tzp = NULL;
        zl = NULL;

        /*
         * This is to prevent the creation of links into attribute space
         * by renaming a linked file into/outof an attribute directory.
         * See the comment in zfs_link() for why this is considered bad.
         */
        if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EINVAL));
        }

        /*
         * Lock source and target directory entries.  To prevent deadlock,
         * a lock ordering must be defined.  We lock the directory with
         * the smallest object id first, or if it's a tie, the one with
         * the lexically first name.
         */
        if (sdzp->z_id < tdzp->z_id) {
                cmp = -1;
        } else if (sdzp->z_id > tdzp->z_id) {
                cmp = 1;
        } else {
                /*
                 * First compare the two name arguments without
                 * considering any case folding.
                 */
                int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);

                cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
                ASSERT(error == 0 || !zfsvfs->z_utf8);
                if (cmp == 0) {
                        /*
                         * POSIX: "If the old argument and the new argument
                         * both refer to links to the same existing file,
                         * the rename() function shall return successfully
                         * and perform no other action."
                         */
                        ZFS_EXIT(zfsvfs);
                        return (0);
                }
                /*
                 * If the file system is case-folding, then we may
                 * have some more checking to do.  A case-folding file
                 * system is either supporting mixed case sensitivity
                 * access or is completely case-insensitive.  Note
                 * that the file system is always case preserving.
                 *
                 * In mixed sensitivity mode case sensitive behavior
                 * is the default.  FIGNORECASE must be used to
                 * explicitly request case insensitive behavior.
                 *
                 * If the source and target names provided differ only
                 * by case (e.g., a request to rename 'tim' to 'Tim'),
                 * we will treat this as a special case in the
                 * case-insensitive mode: as long as the source name
                 * is an exact match, we will allow this to proceed as
                 * a name-change request.
                 */
                if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
                    (zfsvfs->z_case == ZFS_CASE_MIXED &&
                    flags & FIGNORECASE)) &&
                    u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
                    &error) == 0) {
                        /*
                         * case preserving rename request, require exact
                         * name matches
                         */
                        zflg |= ZCIEXACT;
                        zflg &= ~ZCILOOK;
                }
        }

        /*
         * If the source and destination directories are the same, we should
         * grab the z_name_lock of that directory only once.
         */
        if (sdzp == tdzp) {
                zflg |= ZHAVELOCK;
                rw_enter(&sdzp->z_name_lock, RW_READER);
        }

        /* Acquire the two dirent locks in the order chosen above. */
        if (cmp < 0) {
                serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
                    ZEXISTS | zflg, NULL, NULL);
                terr = zfs_dirent_lock(&tdl,
                    tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
        } else {
                terr = zfs_dirent_lock(&tdl,
                    tdzp, tnm, &tzp, zflg, NULL, NULL);
                serr = zfs_dirent_lock(&sdl,
                    sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
                    NULL, NULL);
        }

        if (serr) {
                /*
                 * Source entry invalid or not there.
                 */
                if (!terr) {
                        zfs_dirent_unlock(tdl);
                        if (tzp)
                                VN_RELE(ZTOV(tzp));
                }

                if (sdzp == tdzp)
                        rw_exit(&sdzp->z_name_lock);

                /* Renaming ".." is never legal; report EINVAL, not ENOENT. */
                if (strcmp(snm, "..") == 0)
                        serr = SET_ERROR(EINVAL);
                ZFS_EXIT(zfsvfs);
                return (serr);
        }
        if (terr) {
                /* Target lookup failed; undo the source side and bail. */
                zfs_dirent_unlock(sdl);
                VN_RELE(ZTOV(szp));

                if (sdzp == tdzp)
                        rw_exit(&sdzp->z_name_lock);

                if (strcmp(tnm, "..") == 0)
                        terr = SET_ERROR(EINVAL);
                ZFS_EXIT(zfsvfs);
                return (terr);
        }

        /*
         * Must have write access at the source to remove the old entry
         * and write access at the target to create the new entry.
         * Note that if target and source are the same, this can be
         * done in a single check.
         */

        if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
                goto out;

        if (ZTOV(szp)->v_type == VDIR) {
                /*
                 * Check to make sure rename is valid.
                 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
                 */
                if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
                        goto out;
        }

        /*
         * Does target exist?
         */
        if (tzp) {
                /*
                 * Source and target must be the same type.
                 */
                if (ZTOV(szp)->v_type == VDIR) {
                        if (ZTOV(tzp)->v_type != VDIR) {
                                error = SET_ERROR(ENOTDIR);
                                goto out;
                        }
                } else {
                        if (ZTOV(tzp)->v_type == VDIR) {
                                error = SET_ERROR(EISDIR);
                                goto out;
                        }
                }
                /*
                 * POSIX dictates that when the source and target
                 * entries refer to the same file object, rename
                 * must do nothing and exit without error.
                 */
                if (szp->z_id == tzp->z_id) {
                        error = 0;
                        goto out;
                }
        }

        /*
         * Post the pre-rename vnode events before the transaction is
         * created, so file-event watchers observe the impending change.
         */
        vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct);
        if (tzp)
                vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

        /*
         * notify the target directory if it is not the same
         * as source directory.
         */
        if (tdvp != sdvp) {
                vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
        }

        /* Hold everything the rename may dirty: both dirs, szp, and tzp. */
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
        dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
        if (sdzp != tdzp) {
                dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
                zfs_sa_upgrade_txholds(tx, tdzp);
        }
        if (tzp) {
                dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
                zfs_sa_upgrade_txholds(tx, tzp);
        }

        zfs_sa_upgrade_txholds(tx, szp);
        /* tzp may move to the unlinked set if it is being replaced. */
        dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
        if (error) {
                /* Drop every lock/hold before waiting or failing. */
                if (zl != NULL)
                        zfs_rename_unlock(&zl);
                zfs_dirent_unlock(sdl);
                zfs_dirent_unlock(tdl);

                if (sdzp == tdzp)
                        rw_exit(&sdzp->z_name_lock);

                VN_RELE(ZTOV(szp));
                if (tzp)
                        VN_RELE(ZTOV(tzp));
                if (error == ERESTART) {
                        /* TXG was full: wait for it and retry from top. */
                        waited = B_TRUE;
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        if (tzp)        /* Attempt to remove the existing target */
                error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);

        if (error == 0) {
                error = zfs_link_create(tdl, szp, tx, ZRENAMING);
                if (error == 0) {
                        /* Renamed files must be re-scanned by AV software. */
                        szp->z_pflags |= ZFS_AV_MODIFIED;

                        error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
                            (void *)&szp->z_pflags, sizeof (uint64_t), tx);
                        ASSERT0(error);

                        error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
                        if (error == 0) {
                                zfs_log_rename(zilog, tx, TX_RENAME |
                                    (flags & FIGNORECASE ? TX_CI : 0), sdzp,
                                    sdl->dl_name, tdzp, tdl->dl_name, szp);

                                /*
                                 * Update path information for the target vnode
                                 */
                                vn_renamepath(tdvp, ZTOV(szp), tnm,
                                    strlen(tnm));
                        } else {
                                /*
                                 * At this point, we have successfully created
                                 * the target name, but have failed to remove
                                 * the source name.  Since the create was done
                                 * with the ZRENAMING flag, there are
                                 * complications; for one, the link count is
                                 * wrong.  The easiest way to deal with this
                                 * is to remove the newly created target, and
                                 * return the original error.  This must
                                 * succeed; fortunately, it is very unlikely to
                                 * fail, since we just created it.
                                 */
                                VERIFY3U(zfs_link_destroy(tdl, szp, tx,
                                    ZRENAMING, NULL), ==, 0);
                        }
                }
        }

        dmu_tx_commit(tx);

        /* Post-rename events fire only for the parts that succeeded. */
        if (tzp && rm_err == 0)
                vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);

        if (error == 0) {
                vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
                /*
                 * NOTE(review): unlike the pre-event above, this dest-dir
                 * event is posted even when tdvp == sdvp -- presumably the
                 * event layer handles that case; confirm against
                 * vnevent_rename_dest_dir().
                 */
                vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
        }
out:
        if (zl != NULL)
                zfs_rename_unlock(&zl);

        zfs_dirent_unlock(sdl);
        zfs_dirent_unlock(tdl);

        if (sdzp == tdzp)
                rw_exit(&sdzp->z_name_lock);


        VN_RELE(ZTOV(szp));
        if (tzp)
                VN_RELE(ZTOV(tzp));

        /* Honor sync=always by committing the intent log before return. */
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, 0);

        ZFS_EXIT(zfsvfs);
        return (error);
}
3789 3795  
/*
 * Insert the indicated symbolic reference entry into the directory.
 *
 *      IN:     dvp     - Directory to contain new symbolic link.
 *              link    - Name for new symlink entry.
 *              vap     - Attributes of new entry.
 *              cr      - credentials of caller.
 *              ct      - caller context
 *              flags   - case flags
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * Timestamps:
 *      dvp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
    caller_context_t *ct, int flags)
{
        znode_t         *zp, *dzp = VTOZ(dvp);
        zfs_dirlock_t   *dl;
        dmu_tx_t        *tx;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
        zilog_t         *zilog;
        uint64_t        len = strlen(link);
        int             error;
        int             zflg = ZNEW;
        zfs_acl_ids_t   acl_ids;
        boolean_t       fuid_dirtied;
        uint64_t        txtype = TX_SYMLINK;
        boolean_t       waited = B_FALSE;

        ASSERT(vap->va_type == VLNK);

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(dzp);
        zilog = zfsvfs->z_log;

        /* Reject names that are not valid UTF-8 when the fs demands it. */
        if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
            NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EILSEQ));
        }
        if (flags & FIGNORECASE)
                zflg |= ZCILOOK;

        /* The link target itself is limited to MAXPATHLEN. */
        if (len > MAXPATHLEN) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(ENAMETOOLONG));
        }

        /*
         * Compute ACL/FUID ids once, before the retry loop; they are
         * freed on every error path and after the create succeeds.
         */
        if ((error = zfs_acl_ids_create(dzp, 0,
            vap, cr, NULL, &acl_ids)) != 0) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }
top:
        /*
         * Attempt to lock directory; fail if entry already exists.
         */
        error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
        if (error) {
                zfs_acl_ids_free(&acl_ids);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        /* Caller must be allowed to add entries to the directory. */
        if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
                zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
                zfs_acl_ids_free(&acl_ids);
                zfs_dirent_unlock(dl);
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EDQUOT));
        }
        /* Hold the new object, the directory ZAP entry, and ACL space. */
        tx = dmu_tx_create(zfsvfs->z_os);
        fuid_dirtied = zfsvfs->z_fuid_dirty;
        dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
        dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
        dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
            ZFS_SA_BASE_ATTR_SIZE + len);
        dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
        if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
                dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
                    acl_ids.z_aclp->z_acl_bytes);
        }
        if (fuid_dirtied)
                zfs_fuid_txhold(zfsvfs, tx);
        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
        if (error) {
                zfs_dirent_unlock(dl);
                if (error == ERESTART) {
                        /* TXG was full: wait and retry; acl_ids stay live. */
                        waited = B_TRUE;
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
                zfs_acl_ids_free(&acl_ids);
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        /*
         * Create a new object for the symlink.
         * for version 4 ZPL datasets the symlink will be an SA attribute
         */
        zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);

        if (fuid_dirtied)
                zfs_fuid_sync(zfsvfs, tx);

        /* Store the link target, in the SA or the legacy symlink area. */
        mutex_enter(&zp->z_lock);
        if (zp->z_is_sa)
                error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
                    link, len, tx);
        else
                zfs_sa_symlink(zp, link, len, tx);
        mutex_exit(&zp->z_lock);

        /* A symlink's size is the length of its target path. */
        zp->z_size = len;
        (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
            &zp->z_size, sizeof (zp->z_size), tx);
        /*
         * Insert the new object into the directory.
         */
        (void) zfs_link_create(dl, zp, tx, ZNEW);

        if (flags & FIGNORECASE)
                txtype |= TX_CI;
        zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);

        zfs_acl_ids_free(&acl_ids);

        dmu_tx_commit(tx);

        zfs_dirent_unlock(dl);

        VN_RELE(ZTOV(zp));

        /* Honor sync=always by committing the intent log before return. */
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, 0);

        ZFS_EXIT(zfsvfs);
        return (error);
}
3942 3948  
3943 3949  /*
3944 3950   * Return, in the buffer contained in the provided uio structure,
3945 3951   * the symbolic path referred to by vp.
3946 3952   *
3947 3953   *      IN:     vp      - vnode of symbolic link.
3948 3954   *              uio     - structure to contain the link path.
3949 3955   *              cr      - credentials of caller.
3950 3956   *              ct      - caller context
3951 3957   *
3952 3958   *      OUT:    uio     - structure containing the link path.
3953 3959   *
3954 3960   *      RETURN: 0 on success, error code on failure.
3955 3961   *
3956 3962   * Timestamps:
3957 3963   *      vp - atime updated
3958 3964   */
3959 3965  /* ARGSUSED */
3960 3966  static int
3961 3967  zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3962 3968  {
3963 3969          znode_t         *zp = VTOZ(vp);
3964 3970          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3965 3971          int             error;
3966 3972  
3967 3973          ZFS_ENTER(zfsvfs);
3968 3974          ZFS_VERIFY_ZP(zp);
3969 3975  
3970 3976          mutex_enter(&zp->z_lock);
3971 3977          if (zp->z_is_sa)
3972 3978                  error = sa_lookup_uio(zp->z_sa_hdl,
3973 3979                      SA_ZPL_SYMLINK(zfsvfs), uio);
3974 3980          else
3975 3981                  error = zfs_sa_readlink(zp, uio);
3976 3982          mutex_exit(&zp->z_lock);
3977 3983  
3978 3984          ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3979 3985  
3980 3986          ZFS_EXIT(zfsvfs);
3981 3987          return (error);
3982 3988  }
3983 3989  
/*
 * Insert a new entry into directory tdvp referencing svp.
 *
 *      IN:     tdvp    - Directory to contain new entry.
 *              svp     - vnode of new entry.
 *              name    - name of new entry.
 *              cr      - credentials of caller.
 *              ct      - caller context
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * Timestamps:
 *      tdvp - ctime|mtime updated
 *       svp - ctime updated
 */
/* ARGSUSED */
static int
zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
    caller_context_t *ct, int flags)
{
        znode_t         *dzp = VTOZ(tdvp);
        znode_t         *tzp, *szp;
        zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
        zilog_t         *zilog;
        zfs_dirlock_t   *dl;
        dmu_tx_t        *tx;
        vnode_t         *realvp;
        int             error;
        int             zf = ZNEW;
        uint64_t        parent;
        uid_t           owner;
        boolean_t       waited = B_FALSE;

        ASSERT(tdvp->v_type == VDIR);

        ZFS_ENTER(zfsvfs);
        ZFS_VERIFY_ZP(dzp);
        zilog = zfsvfs->z_log;

        /* Resolve any stacked-fs vnode to the underlying real vnode. */
        if (VOP_REALVP(svp, &realvp, ct) == 0)
                svp = realvp;

        /*
         * POSIX dictates that we return EPERM here.
         * Better choices include ENOTSUP or EISDIR.
         */
        if (svp->v_type == VDIR) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EPERM));
        }

        szp = VTOZ(svp);
        ZFS_VERIFY_ZP(szp);

        /*
         * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
         * ctldir appear to have the same v_vfsp.
         */
        if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EXDEV));
        }

        /* Prevent links to .zfs/shares files */

        if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
            &parent, sizeof (uint64_t))) != 0) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }
        if (parent == zfsvfs->z_shares_dir) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EPERM));
        }

        /* Reject names that are not valid UTF-8 when the fs demands it. */
        if (zfsvfs->z_utf8 && u8_validate(name,
            strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EILSEQ));
        }
        if (flags & FIGNORECASE)
                zf |= ZCILOOK;

        /*
         * We do not support links between attributes and non-attributes
         * because of the potential security risk of creating links
         * into "normal" file space in order to circumvent restrictions
         * imposed in attribute space.
         */
        if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EINVAL));
        }


        /* Only the owner (or a suitably privileged caller) may link. */
        owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
        if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
                ZFS_EXIT(zfsvfs);
                return (SET_ERROR(EPERM));
        }

        if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }

top:
        /*
         * Attempt to lock directory; fail if entry already exists.
         */
        error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
        if (error) {
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        /* Hold the source znode and the directory ZAP entry. */
        tx = dmu_tx_create(zfsvfs->z_os);
        dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
        dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
        zfs_sa_upgrade_txholds(tx, szp);
        zfs_sa_upgrade_txholds(tx, dzp);
        error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
        if (error) {
                zfs_dirent_unlock(dl);
                if (error == ERESTART) {
                        /* TXG was full: wait for it and retry from top. */
                        waited = B_TRUE;
                        dmu_tx_wait(tx);
                        dmu_tx_abort(tx);
                        goto top;
                }
                dmu_tx_abort(tx);
                ZFS_EXIT(zfsvfs);
                return (error);
        }

        error = zfs_link_create(dl, szp, tx, 0);

        if (error == 0) {
                uint64_t txtype = TX_LINK;
                if (flags & FIGNORECASE)
                        txtype |= TX_CI;
                zfs_log_link(zilog, tx, txtype, dzp, szp, name);
        }

        dmu_tx_commit(tx);

        zfs_dirent_unlock(dl);

        /* Post the link vnode event only on success. */
        if (error == 0) {
                vnevent_link(svp, ct);
        }

        /* Honor sync=always by committing the intent log before return. */
        if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
                zil_commit(zilog, 0);

        ZFS_EXIT(zfsvfs);
        return (error);
}
4142 4148  
/*
 * zfs_null_putapage() is used when the file system has been force
 * unmounted. It just drops the pages.
 */
/* ARGSUSED */
static int
zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
    size_t *lenp, int flags, cred_t *cr)
{
	/*
	 * Invalidate the pages without writing them back; there is no
	 * longer anywhere to push them.  B_ERROR marks the operation as
	 * failed for anyone waiting on the pages.
	 */
	pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
	return (0);
}
4155 4161  
/*
 * Push a page out to disk, klustering if possible.
 *
 *      IN:     vp      - file to push page to.
 *              pp      - page to push.
 *              flags   - additional flags.
 *              cr      - credentials of caller.
 *
 *      OUT:    offp    - start of range pushed.
 *              lenp    - len of range pushed.
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * NOTE: callers must have locked the page to be pushed.  On
 * exit, the page (and all other pages in the kluster) must be
 * unlocked.
 */
/* ARGSUSED */
static int
zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
    size_t *lenp, int flags, cred_t *cr)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	dmu_tx_t	*tx;
	u_offset_t	off, koff;
	size_t		len, klen;
	int		err;

	off = pp->p_offset;
	len = PAGESIZE;
	/*
	 * If our blocksize is bigger than the page size, try to kluster
	 * multiple pages so that we write a full block (thus avoiding
	 * a read-modify-write).
	 */
	if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
		/*
		 * A non-power-of-2 blocksize cannot be aligned to, so in
		 * that case kluster from the start of the file.
		 */
		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
		ASSERT(koff <= zp->z_size);
		/* Clamp the kluster so that it does not run past EOF. */
		if (koff + klen > zp->z_size)
			klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
	}
	ASSERT3U(btop(len), ==, btopr(len));

	/*
	 * Can't push pages past end-of-file.
	 */
	if (off >= zp->z_size) {
		/* ignore all pages */
		err = 0;
		goto out;
	} else if (off + len > zp->z_size) {
		/*
		 * The kluster straddles EOF: hand back the pages that lie
		 * wholly past EOF and trim the length so only bytes up to
		 * EOF are written.
		 */
		int npages = btopr(zp->z_size - off);
		page_t *trunc;

		page_list_break(&pp, &trunc, npages);
		/* ignore pages past end of file */
		if (trunc)
			pvn_write_done(trunc, flags);
		len = zp->z_size - off;
	}

	/* Refuse the write if the owner is over the user or group quota. */
	if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
	    zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
		err = SET_ERROR(EDQUOT);
		goto out;
	}
	tx = dmu_tx_create(zfsvfs->z_os);
	dmu_tx_hold_write(tx, zp->z_id, off, len);

	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
	zfs_sa_upgrade_txholds(tx, zp);
	/* TXG_WAIT: block until the transaction can be assigned. */
	err = dmu_tx_assign(tx, TXG_WAIT);
	if (err != 0) {
		dmu_tx_abort(tx);
		goto out;
	}

	if (zp->z_blksz <= PAGESIZE) {
		/* Single page: map it in and copy through dmu_write(). */
		caddr_t va = zfs_map_page(pp, S_READ);
		ASSERT3U(len, <=, PAGESIZE);
		dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
		zfs_unmap_page(pp, va);
	} else {
		/* Multiple pages: let the DMU walk the page list itself. */
		err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
	}

	if (err == 0) {
		/*
		 * Update mtime/ctime and log the write so that the change
		 * is replayable from the ZIL.
		 */
		uint64_t mtime[2], ctime[2];
		sa_bulk_attr_t bulk[3];
		int count = 0;

		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
		    &mtime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
		    &ctime, 16);
		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
		    &zp->z_pflags, 8);
		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
		    B_TRUE);
		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
	}
	dmu_tx_commit(tx);

out:
	/* Unlock the kluster, flagging an error for any waiters. */
	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
	if (offp)
		*offp = off;
	if (lenp)
		*lenp = len;

	return (err);
}
4271 4277  
/*
 * Copy the portion of the file indicated from pages into the file.
 * The pages are stored in a page list attached to the files vnode.
 *
 *      IN:     vp      - vnode of file to push page data to.
 *              off     - position in file to put data.
 *              len     - amount of data to write.
 *              flags   - flags to control the operation.
 *              cr      - credentials of caller.
 *              ct      - caller context.
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * Timestamps:
 *      vp - ctime|mtime updated
 */
/*ARGSUSED*/
static int
zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	page_t		*pp;
	size_t		io_len;
	u_offset_t	io_off;
	uint_t		blksz;
	rl_t		*rl;
	int		error = 0;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/*
	 * There's nothing to do if no data is cached.
	 */
	if (!vn_has_cached_data(vp)) {
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/*
	 * Align this request to the file block size in case we kluster.
	 * XXX - this can result in pretty aggressive locking, which can
	 * impact simultaneous read/write access.  One option might be
	 * to break up long requests (len == 0) into block-by-block
	 * operations to get narrower locking.
	 */
	blksz = zp->z_blksz;
	if (ISP2(blksz))
		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
	else
		io_off = 0;
	if (len > 0 && ISP2(blksz))
		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
	else
		io_len = 0;	/* 0 selects the whole-file path below */

	if (io_len == 0) {
		/*
		 * Search the entire vp list for pages >= io_off.
		 */
		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
		goto out;
	}
	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);

	if (off > zp->z_size) {
		/* past end of file */
		zfs_range_unlock(rl);
		ZFS_EXIT(zfsvfs);
		return (0);
	}

	/* Don't look past the last page backing the file. */
	len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);

	/*
	 * Walk the range page by page.  zfs_putapage() may kluster, in
	 * which case it advances io_off/io_len past several pages at once.
	 */
	for (off = io_off; io_off < off + len; io_off += io_len) {
		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
			pp = page_lookup(vp, io_off,
			    (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
		} else {
			pp = page_lookup_nowait(vp, io_off,
			    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
		}

		if (pp != NULL && pvn_getdirty(pp, flags)) {
			int err;

			/*
			 * Found a dirty page to push
			 */
			err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
			if (err)
				error = err;	/* remember, but keep going */
		} else {
			io_len = PAGESIZE;
		}
	}
out:
	zfs_range_unlock(rl);
	/* Synchronous requests (and sync=always) must be durable on return. */
	if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		zil_commit(zfsvfs->z_log, zp->z_id);
	ZFS_EXIT(zfsvfs);
	return (error);
}
4378 4384  
/*
 * Last-reference teardown for a vnode: flush cached data, persist any
 * dirty atime, and release the znode.
 */
/*ARGSUSED*/
void
zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int error;

	/* Serialize against filesystem teardown (unmount/suspend). */
	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
	if (zp->z_sa_hdl == NULL) {
		/*
		 * The fs has been unmounted, or we did a
		 * suspend/resume and this file no longer exists.
		 */
		if (vn_has_cached_data(vp)) {
			/* Drop cached pages; there is nowhere to push them. */
			(void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
			    B_INVAL, cr);
		}

		mutex_enter(&zp->z_lock);
		mutex_enter(&vp->v_lock);
		ASSERT(vp->v_count == 1);
		vp->v_count = 0;
		mutex_exit(&vp->v_lock);
		mutex_exit(&zp->z_lock);
		rw_exit(&zfsvfs->z_teardown_inactive_lock);
		zfs_znode_free(zp);
		return;
	}

	/*
	 * Attempt to push any data in the page cache.  If this fails
	 * we will get kicked out later in zfs_zinactive().
	 */
	if (vn_has_cached_data(vp)) {
		(void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
		    cr);
	}

	/* Persist a dirty atime unless the file has been unlinked. */
	if (zp->z_atime_dirty && zp->z_unlinked == 0) {
		dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);

		dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
		zfs_sa_upgrade_txholds(tx, zp);
		error = dmu_tx_assign(tx, TXG_WAIT);
		if (error) {
			/* Best-effort: drop the atime update on failure. */
			dmu_tx_abort(tx);
		} else {
			mutex_enter(&zp->z_lock);
			(void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
			    (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
			zp->z_atime_dirty = 0;
			mutex_exit(&zp->z_lock);
			dmu_tx_commit(tx);
		}
	}

	zfs_zinactive(zp);
	rw_exit(&zfsvfs->z_teardown_inactive_lock);
}
4439 4445  
4440 4446  /*
4441 4447   * Bounds-check the seek operation.
4442 4448   *
4443 4449   *      IN:     vp      - vnode seeking within
4444 4450   *              ooff    - old file offset
4445 4451   *              noffp   - pointer to new file offset
4446 4452   *              ct      - caller context
4447 4453   *
4448 4454   *      RETURN: 0 on success, EINVAL if new offset invalid.
4449 4455   */
4450 4456  /* ARGSUSED */
4451 4457  static int
4452 4458  zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4453 4459      caller_context_t *ct)
4454 4460  {
4455 4461          if (vp->v_type == VDIR)
4456 4462                  return (0);
4457 4463          return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4458 4464  }
4459 4465  
4460 4466  /*
4461 4467   * Pre-filter the generic locking function to trap attempts to place
4462 4468   * a mandatory lock on a memory mapped file.
4463 4469   */
4464 4470  static int
4465 4471  zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4466 4472      flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4467 4473  {
4468 4474          znode_t *zp = VTOZ(vp);
4469 4475          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4470 4476  
4471 4477          ZFS_ENTER(zfsvfs);
4472 4478          ZFS_VERIFY_ZP(zp);
4473 4479  
4474 4480          /*
4475 4481           * We are following the UFS semantics with respect to mapcnt
4476 4482           * here: If we see that the file is mapped already, then we will
4477 4483           * return an error, but we don't worry about races between this
4478 4484           * function and zfs_map().
4479 4485           */
4480 4486          if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4481 4487                  ZFS_EXIT(zfsvfs);
4482 4488                  return (SET_ERROR(EAGAIN));
4483 4489          }
4484 4490          ZFS_EXIT(zfsvfs);
4485 4491          return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4486 4492  }
4487 4493  
/*
 * If we can't find a page in the cache, we will create a new page
 * and fill it with file data.  For efficiency, we may try to fill
 * multiple pages at once (klustering) to fill up the supplied page
 * list.  Note that the pages to be filled are held with an exclusive
 * lock to prevent access by other threads while they are being filled.
 */
static int
zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
    caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
{
	znode_t *zp = VTOZ(vp);
	page_t *pp, *cur_pp;
	objset_t *os = zp->z_zfsvfs->z_os;
	u_offset_t io_off, total;
	size_t io_len;
	int err;

	if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
		/*
		 * We only have a single page, don't bother klustering
		 */
		io_off = off;
		io_len = PAGESIZE;
		pp = page_create_va(vp, io_off, io_len,
		    PG_EXCL | PG_WAIT, seg, addr);
	} else {
		/*
		 * Try to find enough pages to fill the page list
		 */
		pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
		    &io_len, off, plsz, 0);
	}
	if (pp == NULL) {
		/*
		 * The page already exists, nothing to do here.
		 */
		*pl = NULL;
		return (0);
	}

	/*
	 * Fill the pages in the kluster.
	 */
	cur_pp = pp;
	for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
		caddr_t va;

		ASSERT3U(io_off, ==, cur_pp->p_offset);
		/* Map each page and read its contents in from the DMU. */
		va = zfs_map_page(cur_pp, S_WRITE);
		err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
		    DMU_READ_PREFETCH);
		zfs_unmap_page(cur_pp, va);
		if (err) {
			/* On error, toss the entire kluster */
			pvn_read_done(pp, B_ERROR);
			/* convert checksum errors into IO errors */
			if (err == ECKSUM)
				err = SET_ERROR(EIO);
			return (err);
		}
		cur_pp = cur_pp->p_next;
	}

	/*
	 * Fill in the page list array from the kluster starting
	 * from the desired offset `off'.
	 * NOTE: the page list will always be null terminated.
	 */
	pvn_plist_init(pp, pl, plsz, off, io_len, rw);
	ASSERT(pl == NULL || (*pl)->p_offset == off);

	return (0);
}
4562 4568  
/*
 * Return pointers to the pages for the file region [off, off + len]
 * in the pl array.  If plsz is greater than len, this function may
 * also return page pointers from after the specified region
 * (i.e. the region [off, off + plsz]).  These additional pages are
 * only returned if they are already in the cache, or were created as
 * part of a klustered read.
 *
 *      IN:     vp      - vnode of file to get data from.
 *              off     - position in file to get data from.
 *              len     - amount of data to retrieve.
 *              plsz    - length of provided page list.
 *              seg     - segment to obtain pages for.
 *              addr    - virtual address of fault.
 *              rw      - mode of created pages.
 *              cr      - credentials of caller.
 *              ct      - caller context.
 *
 *      OUT:    protp   - protection mode of created pages.
 *              pl      - list of pages created.
 *
 *      RETURN: 0 on success, error code on failure.
 *
 * Timestamps:
 *      vp - atime updated
 */
/* ARGSUSED */
static int
zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, cred_t *cr, caller_context_t *ct)
{
	znode_t		*zp = VTOZ(vp);
	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
	page_t		**pl0 = pl;	/* start of the list, for error cleanup */
	int		err = 0;

	/* we do our own caching, faultahead is unnecessary */
	if (pl == NULL)
		return (0);
	else if (len > plsz)
		len = plsz;
	else
		len = P2ROUNDUP(len, PAGESIZE);
	ASSERT(plsz >= len);

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	if (protp)
		*protp = PROT_ALL;

	/*
	 * Loop through the requested range [off, off + len) looking
	 * for pages.  If we don't find a page, we will need to create
	 * a new page and fill it with data from the file.
	 */
	while (len > 0) {
		/* Use a cached page if present; otherwise create and fill. */
		if (*pl = page_lookup(vp, off, SE_SHARED))
			*(pl+1) = NULL;
		else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
			goto out;
		/* Consume whatever run of pages the lookup/fill produced. */
		while (*pl) {
			ASSERT3U((*pl)->p_offset, ==, off);
			off += PAGESIZE;
			addr += PAGESIZE;
			if (len > 0) {
				ASSERT3U(len, >=, PAGESIZE);
				len -= PAGESIZE;
			}
			ASSERT3U(plsz, >=, PAGESIZE);
			plsz -= PAGESIZE;
			pl++;
		}
	}

	/*
	 * Fill out the page array with any pages already in the cache.
	 */
	while (plsz > 0 &&
	    (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
			off += PAGESIZE;
			plsz -= PAGESIZE;
	}
out:
	if (err) {
		/*
		 * Release any pages we have previously locked.
		 */
		while (pl > pl0)
			page_unlock(*--pl);
	} else {
		ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
	}

	/* The returned page list is always NULL-terminated. */
	*pl = NULL;

	ZFS_EXIT(zfsvfs);
	return (err);
}
4663 4669  
/*
 * Request a memory map for a section of a file.  This code interacts
 * with common code and the VM system as follows:
 *
 * - common code calls mmap(), which ends up in smmap_common()
 * - this calls VOP_MAP(), which takes you into (say) zfs
 * - zfs_map() calls as_map(), passing segvn_create() as the callback
 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
 * - zfs_addmap() updates z_mapcnt
 */
/*ARGSUSED*/
static int
zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
    size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	segvn_crargs_t	vn_a;
	int		error;

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);

	/* Writable mappings are refused for immutable/readonly/append files. */
	if ((prot & PROT_WRITE) && (zp->z_pflags &
	    (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EPERM));
	}

	/* Anti-virus quarantined files may not be read or executed. */
	if ((prot & (PROT_READ | PROT_EXEC)) &&
	    (zp->z_pflags & ZFS_AV_QUARANTINED)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EACCES));
	}

	if (vp->v_flag & VNOMAP) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENOSYS));
	}

	/* The mapped range must fit within the maximum file offset. */
	if (off < 0 || len > MAXOFFSET_T - off) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENXIO));
	}

	/* Only regular files may be mapped. */
	if (vp->v_type != VREG) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(ENODEV));
	}

	/*
	 * If file is locked, disallow mapping.
	 */
	if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EAGAIN));
	}

	/* Pick an address and create the segment under the as range lock. */
	as_rangelock(as);
	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
	if (error != 0) {
		as_rangeunlock(as);
		ZFS_EXIT(zfsvfs);
		return (error);
	}

	vn_a.vp = vp;
	vn_a.offset = (u_offset_t)off;
	vn_a.type = flags & MAP_TYPE;
	vn_a.prot = prot;
	vn_a.maxprot = maxprot;
	vn_a.cred = cr;
	vn_a.amp = NULL;
	vn_a.flags = flags & ~MAP_TYPE;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	error = as_map(as, *addrp, len, segvn_create, &vn_a);

	as_rangeunlock(as);
	ZFS_EXIT(zfsvfs);
	return (error);
}
4748 4754  
4749 4755  /* ARGSUSED */
4750 4756  static int
4751 4757  zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4752 4758      size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4753 4759      caller_context_t *ct)
4754 4760  {
4755 4761          uint64_t pages = btopr(len);
4756 4762  
4757 4763          atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4758 4764          return (0);
4759 4765  }
4760 4766  
4761 4767  /*
4762 4768   * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4763 4769   * more accurate mtime for the associated file.  Since we don't have a way of
4764 4770   * detecting when the data was actually modified, we have to resort to
4765 4771   * heuristics.  If an explicit msync() is done, then we mark the mtime when the
4766 4772   * last page is pushed.  The problem occurs when the msync() call is omitted,
4767 4773   * which by far the most common case:
4768 4774   *
4769 4775   *      open()
4770 4776   *      mmap()
4771 4777   *      <modify memory>
4772 4778   *      munmap()
4773 4779   *      close()
4774 4780   *      <time lapse>
4775 4781   *      putpage() via fsflush
4776 4782   *
4777 4783   * If we wait until fsflush to come along, we can have a modification time that
4778 4784   * is some arbitrary point in the future.  In order to prevent this in the
4779 4785   * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4780 4786   * torn down.
4781 4787   */
4782 4788  /* ARGSUSED */
4783 4789  static int
4784 4790  zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4785 4791      size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4786 4792      caller_context_t *ct)
4787 4793  {
4788 4794          uint64_t pages = btopr(len);
4789 4795  
4790 4796          ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4791 4797          atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4792 4798  
4793 4799          if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4794 4800              vn_has_cached_data(vp))
4795 4801                  (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4796 4802  
4797 4803          return (0);
4798 4804  }
4799 4805  
4800 4806  /*
4801 4807   * Free or allocate space in a file.  Currently, this function only
4802 4808   * supports the `F_FREESP' command.  However, this command is somewhat
4803 4809   * misnamed, as its functionality includes the ability to allocate as
4804 4810   * well as free space.
4805 4811   *
4806 4812   *      IN:     vp      - vnode of file to free data in.
4807 4813   *              cmd     - action to take (only F_FREESP supported).
4808 4814   *              bfp     - section of file to free/alloc.
4809 4815   *              flag    - current file open mode flags.
4810 4816   *              offset  - current file offset.
4811 4817   *              cr      - credentials of caller [UNUSED].
4812 4818   *              ct      - caller context.
4813 4819   *
4814 4820   *      RETURN: 0 on success, error code on failure.
4815 4821   *
4816 4822   * Timestamps:
4817 4823   *      vp - ctime|mtime updated
4818 4824   */
4819 4825  /* ARGSUSED */
4820 4826  static int
4821 4827  zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4822 4828      offset_t offset, cred_t *cr, caller_context_t *ct)
4823 4829  {
4824 4830          znode_t         *zp = VTOZ(vp);
4825 4831          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4826 4832          uint64_t        off, len;
4827 4833          int             error;
4828 4834  
4829 4835          ZFS_ENTER(zfsvfs);
4830 4836          ZFS_VERIFY_ZP(zp);
4831 4837  
4832 4838          if (cmd != F_FREESP) {
4833 4839                  ZFS_EXIT(zfsvfs);
4834 4840                  return (SET_ERROR(EINVAL));
4835 4841          }
4836 4842  
4837 4843          /*
4838 4844           * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
4839 4845           * callers might not be able to detect properly that we are read-only,
4840 4846           * so check it explicitly here.
4841 4847           */
4842 4848          if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
4843 4849                  ZFS_EXIT(zfsvfs);
4844 4850                  return (SET_ERROR(EROFS));
4845 4851          }
4846 4852  
4847 4853          if (error = convoff(vp, bfp, 0, offset)) {
4848 4854                  ZFS_EXIT(zfsvfs);
4849 4855                  return (error);
4850 4856          }
4851 4857  
  
    | 
      ↓ open down ↓ | 
    1075 lines elided | 
    
      ↑ open up ↑ | 
  
4852 4858          if (bfp->l_len < 0) {
4853 4859                  ZFS_EXIT(zfsvfs);
4854 4860                  return (SET_ERROR(EINVAL));
4855 4861          }
4856 4862  
4857 4863          off = bfp->l_start;
4858 4864          len = bfp->l_len; /* 0 means from off to end of file */
4859 4865  
4860 4866          error = zfs_freesp(zp, off, len, flag, TRUE);
4861 4867  
4862      -        if (error == 0 && off == 0 && len == 0)
4863      -                vnevent_truncate(ZTOV(zp), ct);
     4868 +        if (error == 0 && len == 0) {
     4869 +                if (off == 0) {
     4870 +                        vnevent_truncate(ZTOV(zp), ct);
     4871 +                } else {
     4872 +                        vnevent_resize(ZTOV(zp), ct);
     4873 +                }
     4874 +        }
4864 4875  
4865 4876          ZFS_EXIT(zfsvfs);
4866 4877          return (error);
4867 4878  }
4868 4879  
4869 4880  /*ARGSUSED*/
4870 4881  static int
4871 4882  zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4872 4883  {
4873 4884          znode_t         *zp = VTOZ(vp);
4874 4885          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4875 4886          uint32_t        gen;
4876 4887          uint64_t        gen64;
4877 4888          uint64_t        object = zp->z_id;
4878 4889          zfid_short_t    *zfid;
4879 4890          int             size, i, error;
4880 4891  
4881 4892          ZFS_ENTER(zfsvfs);
4882 4893          ZFS_VERIFY_ZP(zp);
4883 4894  
4884 4895          if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4885 4896              &gen64, sizeof (uint64_t))) != 0) {
4886 4897                  ZFS_EXIT(zfsvfs);
4887 4898                  return (error);
4888 4899          }
4889 4900  
4890 4901          gen = (uint32_t)gen64;
4891 4902  
4892 4903          size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4893 4904          if (fidp->fid_len < size) {
4894 4905                  fidp->fid_len = size;
4895 4906                  ZFS_EXIT(zfsvfs);
4896 4907                  return (SET_ERROR(ENOSPC));
4897 4908          }
4898 4909  
4899 4910          zfid = (zfid_short_t *)fidp;
4900 4911  
4901 4912          zfid->zf_len = size;
4902 4913  
4903 4914          for (i = 0; i < sizeof (zfid->zf_object); i++)
4904 4915                  zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4905 4916  
4906 4917          /* Must have a non-zero generation number to distinguish from .zfs */
4907 4918          if (gen == 0)
4908 4919                  gen = 1;
4909 4920          for (i = 0; i < sizeof (zfid->zf_gen); i++)
4910 4921                  zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4911 4922  
4912 4923          if (size == LONG_FID_LEN) {
4913 4924                  uint64_t        objsetid = dmu_objset_id(zfsvfs->z_os);
4914 4925                  zfid_long_t     *zlfid;
4915 4926  
4916 4927                  zlfid = (zfid_long_t *)fidp;
4917 4928  
4918 4929                  for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4919 4930                          zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4920 4931  
4921 4932                  /* XXX - this should be the generation number for the objset */
4922 4933                  for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4923 4934                          zlfid->zf_setgen[i] = 0;
4924 4935          }
4925 4936  
4926 4937          ZFS_EXIT(zfsvfs);
4927 4938          return (0);
4928 4939  }
4929 4940  
/*
 * VOP_PATHCONF: report filesystem limits and feature availability.
 * Anything not handled explicitly falls through to fs_pathconf().
 */
static int
zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
    caller_context_t *ct)
{
	znode_t		*zp, *xzp;
	zfsvfs_t	*zfsvfs;
	zfs_dirlock_t	*dl;
	int		error;

	switch (cmd) {
	case _PC_LINK_MAX:
		/* ZFS imposes no practical link-count limit. */
		*valp = ULONG_MAX;
		return (0);

	case _PC_FILESIZEBITS:
		/* File offsets are full 64-bit quantities. */
		*valp = 64;
		return (0);

	case _PC_XATTR_EXISTS:
		/*
		 * Probe for a non-empty extended-attribute directory;
		 * *valp is 1 only if the xattr dir exists and has entries.
		 */
		zp = VTOZ(vp);
		zfsvfs = zp->z_zfsvfs;
		ZFS_ENTER(zfsvfs);
		ZFS_VERIFY_ZP(zp);
		*valp = 0;
		error = zfs_dirent_lock(&dl, zp, "", &xzp,
		    ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
		if (error == 0) {
			zfs_dirent_unlock(dl);
			if (!zfs_dirempty(xzp))
				*valp = 1;
			VN_RELE(ZTOV(xzp));
		} else if (error == ENOENT) {
			/*
			 * If there aren't extended attributes, it's the
			 * same as having zero of them.
			 */
			error = 0;
		}
		ZFS_EXIT(zfsvfs);
		return (error);

	case _PC_SATTR_ENABLED:
	case _PC_SATTR_EXISTS:
		/* System attributes apply only to files and directories. */
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
		    (vp->v_type == VREG || vp->v_type == VDIR);
		return (0);

	case _PC_ACCESS_FILTERING:
		/* Access filtering is a directory-only feature. */
		*valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
		    vp->v_type == VDIR;
		return (0);

	case _PC_ACL_ENABLED:
		/* ZFS implements NFSv4-style ACEs. */
		*valp = _ACL_ACE_ENABLED;
		return (0);

	case _PC_MIN_HOLE_SIZE:
		*valp = (ulong_t)SPA_MINBLOCKSIZE;
		return (0);

	case _PC_TIMESTAMP_RESOLUTION:
		/* nanosecond timestamp resolution */
		*valp = 1L;
		return (0);

	default:
		return (fs_pathconf(vp, cmd, valp, cr, ct));
	}
}
4999 5010  
5000 5011  /*ARGSUSED*/
5001 5012  static int
5002 5013  zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5003 5014      caller_context_t *ct)
5004 5015  {
5005 5016          znode_t *zp = VTOZ(vp);
5006 5017          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5007 5018          int error;
5008 5019          boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5009 5020  
5010 5021          ZFS_ENTER(zfsvfs);
5011 5022          ZFS_VERIFY_ZP(zp);
5012 5023          error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5013 5024          ZFS_EXIT(zfsvfs);
5014 5025  
5015 5026          return (error);
5016 5027  }
5017 5028  
5018 5029  /*ARGSUSED*/
5019 5030  static int
5020 5031  zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5021 5032      caller_context_t *ct)
5022 5033  {
5023 5034          znode_t *zp = VTOZ(vp);
5024 5035          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5025 5036          int error;
5026 5037          boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5027 5038          zilog_t *zilog = zfsvfs->z_log;
5028 5039  
5029 5040          ZFS_ENTER(zfsvfs);
5030 5041          ZFS_VERIFY_ZP(zp);
5031 5042  
5032 5043          error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5033 5044  
5034 5045          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5035 5046                  zil_commit(zilog, 0);
5036 5047  
5037 5048          ZFS_EXIT(zfsvfs);
5038 5049          return (error);
5039 5050  }
5040 5051  
/*
 * The smallest read we may consider to loan out an arcbuf.
 * This must be a power of 2.
 * (Tunable: non-static so it can be adjusted, e.g. via mdb.)
 */
int zcr_blksz_min = (1 << 10);	/* 1K */
/*
 * If set to less than the file block size, allow loaning out of an
 * arcbuf for a partial block read.  This must be a power of 2.
 */
int zcr_blksz_max = (1 << 17);	/* 128K */
5051 5062  
/*
 * VOP_REQZCBUF: set up an xuio for zero-copy I/O by loaning out arc
 * buffers.  For writes, the request is carved into an optional partial
 * leading block (preamble), whole blocks, and an optional partial
 * trailing block (postamble).  For reads, a single eligibility check is
 * done here and the buffers are supplied later by the read path.
 * Returns 0 on success or EINVAL when zero-copy cannot be used.
 */
/*ARGSUSED*/
static int
zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
    caller_context_t *ct)
{
	znode_t *zp = VTOZ(vp);
	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
	int max_blksz = zfsvfs->z_max_blksz;
	uio_t *uio = &xuio->xu_uio;
	ssize_t size = uio->uio_resid;
	offset_t offset = uio->uio_loffset;
	int blksz;
	int fullblk, i;
	arc_buf_t *abuf;
	ssize_t maxsize;
	int preamble, postamble;

	if (xuio->xu_type != UIOTYPE_ZEROCOPY)
		return (SET_ERROR(EINVAL));

	ZFS_ENTER(zfsvfs);
	ZFS_VERIFY_ZP(zp);
	switch (ioflag) {
	case UIO_WRITE:
		/*
		 * Loan out an arc_buf for write if write size is bigger than
		 * max_blksz, and the file's block size is also max_blksz.
		 */
		blksz = max_blksz;
		if (size < blksz || zp->z_blksz != blksz) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
		/*
		 * Caller requests buffers for write before knowing where the
		 * write offset might be (e.g. NFS TCP write).
		 */
		if (offset == -1) {
			preamble = 0;
		} else {
			preamble = P2PHASE(offset, blksz);
			if (preamble) {
				preamble = blksz - preamble;
				size -= preamble;
			}
		}

		/* Whatever doesn't fill whole blocks becomes the postamble. */
		postamble = P2PHASE(size, blksz);
		size -= postamble;

		fullblk = size / blksz;
		(void) dmu_xuio_init(xuio,
		    (preamble != 0) + fullblk + (postamble != 0));
		DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
		    int, postamble, int,
		    (preamble != 0) + fullblk + (postamble != 0));

		/*
		 * Have to fix iov base/len for partial buffers.  They
		 * currently represent full arc_buf's.
		 */
		if (preamble) {
			/* data begins in the middle of the arc_buf */
			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    blksz);
			ASSERT(abuf);
			(void) dmu_xuio_add(xuio, abuf,
			    blksz - preamble, preamble);
		}

		for (i = 0; i < fullblk; i++) {
			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    blksz);
			ASSERT(abuf);
			(void) dmu_xuio_add(xuio, abuf, 0, blksz);
		}

		if (postamble) {
			/* data ends in the middle of the arc_buf */
			abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
			    blksz);
			ASSERT(abuf);
			(void) dmu_xuio_add(xuio, abuf, 0, postamble);
		}
		break;
	case UIO_READ:
		/*
		 * Loan out an arc_buf for read if the read size is larger than
		 * the current file block size.  Block alignment is not
		 * considered.  Partial arc_buf will be loaned out for read.
		 */
		blksz = zp->z_blksz;
		if (blksz < zcr_blksz_min)
			blksz = zcr_blksz_min;
		if (blksz > zcr_blksz_max)
			blksz = zcr_blksz_max;
		/* avoid potential complexity of dealing with it */
		if (blksz > max_blksz) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}

		/* Clamp the request to what remains before EOF. */
		maxsize = zp->z_size - uio->uio_loffset;
		if (size > maxsize)
			size = maxsize;

		/* Cached pages would require copying anyway; punt. */
		if (size < blksz || vn_has_cached_data(vp)) {
			ZFS_EXIT(zfsvfs);
			return (SET_ERROR(EINVAL));
		}
		break;
	default:
		ZFS_EXIT(zfsvfs);
		return (SET_ERROR(EINVAL));
	}

	/* Mark the uio as an extended (zero-copy) uio for the I/O path. */
	uio->uio_extflg = UIO_XUIO;
	XUIO_XUZC_RW(xuio) = ioflag;
	ZFS_EXIT(zfsvfs);
	return (0);
}
5173 5184  
5174 5185  /*ARGSUSED*/
5175 5186  static int
5176 5187  zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5177 5188  {
5178 5189          int i;
5179 5190          arc_buf_t *abuf;
5180 5191          int ioflag = XUIO_XUZC_RW(xuio);
5181 5192  
5182 5193          ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5183 5194  
5184 5195          i = dmu_xuio_cnt(xuio);
5185 5196          while (i-- > 0) {
5186 5197                  abuf = dmu_xuio_arcbuf(xuio, i);
5187 5198                  /*
5188 5199                   * if abuf == NULL, it must be a write buffer
5189 5200                   * that has been returned in zfs_write().
5190 5201                   */
5191 5202                  if (abuf)
5192 5203                          dmu_return_arcbuf(abuf);
5193 5204                  ASSERT(abuf || ioflag == UIO_WRITE);
5194 5205          }
5195 5206  
5196 5207          dmu_xuio_fini(xuio);
5197 5208          return (0);
5198 5209  }
5199 5210  
/*
 * Predeclare these here so that the compiler assumes that
 * this is an "old style" function declaration that does
 * not include arguments => we won't get type mismatch errors
 * in the initializations that follow.
 */
static int zfs_inval();
static int zfs_isdir();

/* Catch-all error entry: operation is invalid for this vnode type. */
static int
zfs_inval()
{
	return (SET_ERROR(EINVAL));
}

/* Catch-all error entry: operation is not permitted on a directory. */
static int
zfs_isdir()
{
	return (SET_ERROR(EISDIR));
}
/*
 * Directory vnode operations template
 *
 * read/write are rejected with EISDIR via zfs_isdir().
 */
vnodeops_t *zfs_dvnodeops;
const fs_operation_def_t zfs_dvnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = zfs_open },
	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
	VOPNAME_READ,		{ .error = zfs_isdir },
	VOPNAME_WRITE,		{ .error = zfs_isdir },
	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = zfs_create },
	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
	VOPNAME_LINK,		{ .vop_link = zfs_link },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = zfs_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = zfs_symlink },
	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
5252 5263  
/*
 * Regular file vnode operations template
 *
 * Includes paging (getpage/putpage), mmap, record locking (frlock),
 * space management (VOP_SPACE -> zfs_space), and zero-copy I/O
 * (reqzcbuf/retzcbuf).
 */
vnodeops_t *zfs_fvnodeops;
const fs_operation_def_t zfs_fvnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = zfs_open },
	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
	VOPNAME_READ,		{ .vop_read = zfs_read },
	VOPNAME_WRITE,		{ .vop_write = zfs_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = zfs_frlock },
	VOPNAME_SPACE,		{ .vop_space = zfs_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = zfs_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = zfs_putpage },
	VOPNAME_MAP,		{ .vop_map = zfs_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = zfs_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = zfs_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	VOPNAME_REQZCBUF,	{ .vop_reqzcbuf = zfs_reqzcbuf },
	VOPNAME_RETZCBUF,	{ .vop_retzcbuf = zfs_retzcbuf },
	NULL,			NULL
};
5287 5298  
/*
 * Symbolic link vnode operations template
 */
vnodeops_t *zfs_symvnodeops;
const fs_operation_def_t zfs_symvnodeops_template[] = {
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_READLINK,	{ .vop_readlink = zfs_readlink },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
5304 5315  
/*
 * special share hidden files vnode operations template
 */
vnodeops_t *zfs_sharevnodeops;
const fs_operation_def_t zfs_sharevnodeops_template[] = {
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
5320 5331  
/*
 * Extended attribute directory vnode operations template
 *
 * This template is identical to the directory vnodes
 * operation template except for restricted operations:
 *	VOP_MKDIR()
 *	VOP_SYMLINK()
 *
 * Note that there are other restrictions embedded in:
 *	zfs_create()	- restrict type to VREG
 *	zfs_link()	- no links into/out of attribute space
 *	zfs_rename()	- no moves into/out of attribute space
 */
vnodeops_t *zfs_xdvnodeops;
const fs_operation_def_t zfs_xdvnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = zfs_open },
	VOPNAME_CLOSE,		{ .vop_close = zfs_close },
	VOPNAME_IOCTL,		{ .vop_ioctl = zfs_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = zfs_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = zfs_setattr },
	VOPNAME_ACCESS,		{ .vop_access = zfs_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = zfs_lookup },
	VOPNAME_CREATE,		{ .vop_create = zfs_create },
	VOPNAME_REMOVE,		{ .vop_remove = zfs_remove },
	VOPNAME_LINK,		{ .vop_link = zfs_link },
	VOPNAME_RENAME,		{ .vop_rename = zfs_rename },
	VOPNAME_MKDIR,		{ .error = zfs_inval },	/* restricted */
	VOPNAME_RMDIR,		{ .vop_rmdir = zfs_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = zfs_readdir },
	VOPNAME_SYMLINK,	{ .error = zfs_inval },	/* restricted */
	VOPNAME_FSYNC,		{ .vop_fsync = zfs_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_FID,		{ .vop_fid = zfs_fid },
	VOPNAME_SEEK,		{ .vop_seek = zfs_seek },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	VOPNAME_GETSECATTR,	{ .vop_getsecattr = zfs_getsecattr },
	VOPNAME_SETSECATTR,	{ .vop_setsecattr = zfs_setsecattr },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
5361 5372  
/*
 * Error vnode operations template
 *
 * Minimal table used for vnodes in an error state; only teardown
 * (inactive) and pathconf are provided.
 */
vnodeops_t *zfs_evnodeops;
const fs_operation_def_t zfs_evnodeops_template[] = {
	VOPNAME_INACTIVE,	{ .vop_inactive = zfs_inactive },
	VOPNAME_PATHCONF,	{ .vop_pathconf = zfs_pathconf },
	NULL,			NULL
};
  
    | 
      ↓ open down ↓ | 
    497 lines elided | 
    
      ↑ open up ↑ | 
  
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX