io-lx-public-vs-joyent Wdiff usr/src/uts/common/fs/zfs/zfs_vnops.c

Print this page

Split	Close
Expand all
Collapse all

          --- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
          +++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c

   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  24   24   * Copyright 2014 Nexenta Systems, Inc.  All rights reserved.
  25   25   * Copyright (c) 2014 Integros [integros.com]
  26   26   * Copyright 2016 Joyent, Inc.
  27   27   */
  28   28  
  29   29  /* Portions Copyright 2007 Jeremy Teo */
  30   30  /* Portions Copyright 2010 Robert Milkowski */
  31   31  
  32   32  #include <sys/types.h>
  33   33  #include <sys/param.h>
  34   34  #include <sys/time.h>
  35   35  #include <sys/systm.h>
  36   36  #include <sys/sysmacros.h>
  37   37  #include <sys/resource.h>
  38   38  #include <sys/vfs.h>
  39   39  #include <sys/vfs_opreg.h>
  40   40  #include <sys/vnode.h>
  41   41  #include <sys/file.h>
  42   42  #include <sys/stat.h>
  43   43  #include <sys/kmem.h>
  44   44  #include <sys/taskq.h>
  45   45  #include <sys/uio.h>
  46   46  #include <sys/vmsystm.h>
  47   47  #include <sys/atomic.h>
  48   48  #include <sys/vm.h>
  49   49  #include <vm/seg_vn.h>
  50   50  #include <vm/pvn.h>
  51   51  #include <vm/as.h>
  52   52  #include <vm/kpm.h>
  53   53  #include <vm/seg_kpm.h>
  54   54  #include <sys/mman.h>
  55   55  #include <sys/pathname.h>
  56   56  #include <sys/cmn_err.h>
  57   57  #include <sys/errno.h>
  58   58  #include <sys/unistd.h>
  59   59  #include <sys/zfs_dir.h>
  60   60  #include <sys/zfs_acl.h>
  61   61  #include <sys/zfs_ioctl.h>
  62   62  #include <sys/fs/zfs.h>
  63   63  #include <sys/dmu.h>
  64   64  #include <sys/dmu_objset.h>
  65   65  #include <sys/spa.h>
  66   66  #include <sys/txg.h>
  67   67  #include <sys/dbuf.h>
  68   68  #include <sys/zap.h>
  69   69  #include <sys/sa.h>
  70   70  #include <sys/dirent.h>
  71   71  #include <sys/policy.h>
  72   72  #include <sys/sunddi.h>
  73   73  #include <sys/filio.h>
  74   74  #include <sys/sid.h>
  75   75  #include "fs/fs_subr.h"
  76   76  #include <sys/zfs_ctldir.h>
  77   77  #include <sys/zfs_fuid.h>
  78   78  #include <sys/zfs_sa.h>
  79   79  #include <sys/dnlc.h>
  80   80  #include <sys/zfs_rlock.h>
  81   81  #include <sys/extdirent.h>
  82   82  #include <sys/kidmap.h>
  83   83  #include <sys/cred.h>
  84   84  #include <sys/attr.h>
  85   85  
  86   86  /*
  87   87   * Programming rules.
  88   88   *
  89   89   * Each vnode op performs some logical unit of work.  To do this, the ZPL must
  90   90   * properly lock its in-core state, create a DMU transaction, do the work,
  91   91   * record this work in the intent log (ZIL), commit the DMU transaction,
  92   92   * and wait for the intent log to commit if it is a synchronous operation.
  93   93   * Moreover, the vnode ops must work in both normal and log replay context.
  94   94   * The ordering of events is important to avoid deadlocks and references
  95   95   * to freed memory.  The example below illustrates the following Big Rules:
  96   96   *
  97   97   *  (1) A check must be made in each zfs thread for a mounted file system.
  98   98   *      This is done avoiding races using ZFS_ENTER(zfsvfs).
  99   99   *      A ZFS_EXIT(zfsvfs) is needed before all returns.  Any znodes
 100  100   *      must be checked with ZFS_VERIFY_ZP(zp).  Both of these macros
 101  101   *      can return EIO from the calling function.
 102  102   *
 103  103   *  (2) VN_RELE() should always be the last thing except for zil_commit()
 104  104   *      (if necessary) and ZFS_EXIT(). This is for 3 reasons:
 105  105   *      First, if it's the last reference, the vnode/znode
 106  106   *      can be freed, so the zp may point to freed memory.  Second, the last
 107  107   *      reference will call zfs_zinactive(), which may induce a lot of work --
 108  108   *      pushing cached pages (which acquires range locks) and syncing out
 109  109   *      cached atime changes.  Third, zfs_zinactive() may require a new tx,
 110  110   *      which could deadlock the system if you were already holding one.
 111  111   *      If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
 112  112   *
 113  113   *  (3) All range locks must be grabbed before calling dmu_tx_assign(),
 114  114   *      as they can span dmu_tx_assign() calls.
 115  115   *
 116  116   *  (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
 117  117   *      dmu_tx_assign().  This is critical because we don't want to block
 118  118   *      while holding locks.
 119  119   *
 120  120   *      If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT.  This
 121  121   *      reduces lock contention and CPU usage when we must wait (note that if
 122  122   *      throughput is constrained by the storage, nearly every transaction
 123  123   *      must wait).
 124  124   *
 125  125   *      Note, in particular, that if a lock is sometimes acquired before
 126  126   *      the tx assigns, and sometimes after (e.g. z_lock), then failing
 127  127   *      to use a non-blocking assign can deadlock the system.  The scenario:
 128  128   *
 129  129   *      Thread A has grabbed a lock before calling dmu_tx_assign().
 130  130   *      Thread B is in an already-assigned tx, and blocks for this lock.
 131  131   *      Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
 132  132   *      forever, because the previous txg can't quiesce until B's tx commits.
 133  133   *
 134  134   *      If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
 135  135   *      then drop all locks, call dmu_tx_wait(), and try again.  On subsequent
 136  136   *      calls to dmu_tx_assign(), pass TXG_WAITED rather than TXG_NOWAIT,
 137  137   *      to indicate that this operation has already called dmu_tx_wait().
 138  138   *      This will ensure that we don't retry forever, waiting a short bit
 139  139   *      each time.
 140  140   *
 141  141   *  (5) If the operation succeeded, generate the intent log entry for it
 142  142   *      before dropping locks.  This ensures that the ordering of events
 143  143   *      in the intent log matches the order in which they actually occurred.
 144  144   *      During ZIL replay the zfs_log_* functions will update the sequence
 145  145   *      number to indicate the zil transaction has replayed.
 146  146   *
 147  147   *  (6) At the end of each vnode op, the DMU tx must always commit,
 148  148   *      regardless of whether there were any errors.
 149  149   *
 150  150   *  (7) After dropping all locks, invoke zil_commit(zilog, foid)
 151  151   *      to ensure that synchronous semantics are provided when necessary.
 152  152   *
 153  153   * In general, this is how things should be ordered in each vnode op:
 154  154   *
 155  155   *      ZFS_ENTER(zfsvfs);              // exit if unmounted
 156  156   * top:
 157  157   *      zfs_dirent_lock(&dl, ...)       // lock directory entry (may VN_HOLD())
 158  158   *      rw_enter(...);                  // grab any other locks you need
 159  159   *      tx = dmu_tx_create(...);        // get DMU tx
 160  160   *      dmu_tx_hold_*();                // hold each object you might modify
 161  161   *      error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
 162  162   *      if (error) {
 163  163   *              rw_exit(...);           // drop locks
 164  164   *              zfs_dirent_unlock(dl);  // unlock directory entry
 165  165   *              VN_RELE(...);           // release held vnodes
 166  166   *              if (error == ERESTART) {
 167  167   *                      waited = B_TRUE;
 168  168   *                      dmu_tx_wait(tx);
 169  169   *                      dmu_tx_abort(tx);
 170  170   *                      goto top;
 171  171   *              }
 172  172   *              dmu_tx_abort(tx);       // abort DMU tx
 173  173   *              ZFS_EXIT(zfsvfs);       // finished in zfs
 174  174   *              return (error);         // really out of space
 175  175   *      }
 176  176   *      error = do_real_work();         // do whatever this VOP does
 177  177   *      if (error == 0)
 178  178   *              zfs_log_*(...);         // on success, make ZIL entry
 179  179   *      dmu_tx_commit(tx);              // commit DMU tx -- error or not
 180  180   *      rw_exit(...);                   // drop locks
 181  181   *      zfs_dirent_unlock(dl);          // unlock directory entry
 182  182   *      VN_RELE(...);                   // release held vnodes
 183  183   *      zil_commit(zilog, foid);        // synchronous when necessary
 184  184   *      ZFS_EXIT(zfsvfs);               // finished in zfs
 185  185   *      return (error);                 // done, report error
 186  186   */
 187  187  
           /*
            * Open a ZFS vnode.
            *
            * Fails with EPERM when the znode has ZFS_APPENDONLY set and the open
            * asks for FWRITE without FAPPEND.  For a non-empty, non-quarantined
            * regular file (VREG) for which zfs_has_ctldir() is false and
            * z_vscan is set, fs_vscan() is called and any non-zero result is
            * reported as EACCES.  Opens with FSYNC or FDSYNC increment
            * zp->z_sync_cnt.
            *
            * Returns 0 on success or an errno value.  Per the Programming rules
            * above, ZFS_ENTER() and ZFS_VERIFY_ZP() can also return EIO from here.
            */
 188  188  /* ARGSUSED */
 189  189  static int
 190  190  zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
 191  191  {
 192  192          znode_t *zp = VTOZ(*vpp);
 193  193          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 194  194  
 195  195          ZFS_ENTER(zfsvfs);
 196  196          ZFS_VERIFY_ZP(zp);
 197  197  
                   /* Append-only files may only be opened for write with FAPPEND */
 198  198          if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
 199  199              ((flag & FAPPEND) == 0)) {
 200  200                  ZFS_EXIT(zfsvfs);
 201  201                  return (SET_ERROR(EPERM));
 202  202          }
 203  203  
 204  204          if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 205  205              ZTOV(zp)->v_type == VREG &&
 206  206              !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
 207  207                  if (fs_vscan(*vpp, cr, 0) != 0) {
 208  208                          ZFS_EXIT(zfsvfs);
 209  209                          return (SET_ERROR(EACCES));
 210  210                  }
 211  211          }
 212  212  
 213  213          /* Keep a count of the synchronous opens in the znode */
 214  214          if (flag & (FSYNC | FDSYNC))
 215  215                  atomic_inc_32(&zp->z_sync_cnt);
 216  216  
 217  217          ZFS_EXIT(zfsvfs);
 218  218          return (0);
 219  219  }
 220  220  
           /*
            * Close a ZFS vnode.
            *
            * Before entering ZFS, drops the locks and shares that the calling
            * process (ddi_get_pid()) holds on vp.  When the file was opened with
            * FSYNC or FDSYNC and count == 1, zp->z_sync_cnt is decremented.
            * NOTE(review): count == 1 presumably identifies the last close of
            * the open file -- confirm against the VOP_CLOSE() contract.
            *
            * Under the same conditions that zfs_open() uses, fs_vscan() is
            * called with a final argument of 1; VERIFY() requires it to return 0.
            *
            * Returns 0.  Per the Programming rules above, ZFS_ENTER() and
            * ZFS_VERIFY_ZP() can return EIO from here.
            */
 221  221  /* ARGSUSED */
 222  222  static int
 223  223  zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
 224  224      caller_context_t *ct)
 225  225  {
 226  226          znode_t *zp = VTOZ(vp);
 227  227          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
 228  228  
 229  229          /*
 230  230           * Clean up any locks held by this process on the vp.
 231  231           */
 232  232          cleanlocks(vp, ddi_get_pid(), 0);
 233  233          cleanshares(vp, ddi_get_pid());
 234  234  
 235  235          ZFS_ENTER(zfsvfs);
 236  236          ZFS_VERIFY_ZP(zp);
 237  237  
 238  238          /* Decrement the synchronous opens in the znode */
 239  239          if ((flag & (FSYNC | FDSYNC)) && (count == 1))
 240  240                  atomic_dec_32(&zp->z_sync_cnt);
 241  241  
 242  242          if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
 243  243              ZTOV(zp)->v_type == VREG &&
 244  244              !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
 245  245                  VERIFY(fs_vscan(vp, cr, 1) == 0);
 246  246  
 247  247          ZFS_EXIT(zfsvfs);
 248  248          return (0);
 249  249  }
 250  250  
 251  251  /*
 252  252   * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
 253  253   * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
            *
            * Any cmd other than _FIO_SEEK_HOLE is treated as a search for data.
            *
            * Returns ENXIO if the incoming offset is at or past z_size, or if
            * dmu_offset_next() reports ESRCH; otherwise returns the result of
            * dmu_offset_next().  The value behind "off" is only written when the
            * resulting offset is not below the incoming one.
 254  254   */
 255  255  static int
 256  256  zfs_holey(vnode_t *vp, int cmd, offset_t *off)
 257  257  {
 258  258          znode_t *zp = VTOZ(vp);
 259  259          uint64_t noff = (uint64_t)*off; /* new offset */
 260  260          uint64_t file_sz;
 261  261          int error;
 262  262          boolean_t hole;
 263  263  
 264  264          file_sz = zp->z_size;
 265  265          if (noff >= file_sz)  {
 266  266                  return (SET_ERROR(ENXIO));
 267  267          }
 268  268  
 269  269          if (cmd == _FIO_SEEK_HOLE)
 270  270                  hole = B_TRUE;
 271  271          else
 272  272                  hole = B_FALSE;
 273  273  
 274  274          error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
 275  275  
 276  276          if (error == ESRCH)
 277  277                  return (SET_ERROR(ENXIO));
 278  278  
 279  279          /*
 280  280           * We could find a hole that begins after the logical end-of-file,
 281  281           * because dmu_offset_next() only works on whole blocks.  If the
 282  282           * EOF falls mid-block, then indicate that the "virtual hole"
 283  283           * at the end of the file begins at the logical EOF, rather than
 284  284           * at the end of the last block.
 285  285           */
 286  286          if (noff > file_sz) {
 287  287                  ASSERT(hole);
 288  288                  noff = file_sz;
 289  289          }
 290  290  
                   /* Never move the caller's offset backwards; leave it untouched. */
 291  291          if (noff < *off)
 292  292                  return (error);
 293  293          *off = noff;
 294  294          return (error);
 295  295  }
 296  296  
           /*
            * ioctl entry point.  Commands handled:
            *
            *      _FIOFFS                 returns zfs_sync() for vp's vfs.
            *      _FIOGDIO, _FIOSDIO      accepted and ignored; return 0.
            *      _FIO_SEEK_DATA,
            *      _FIO_SEEK_HOLE          copy an offset_t in from "data", run
            *                              zfs_holey(), copy the result back out.
            *      _FIO_COUNT_FILLED       wait for the object to be synced, then
            *                              copy doi_fill_count out to "data" as an
            *                              offset_t.
            *
            * Any other command returns ENOTTY.  A failed ddi_copyin() or
            * ddi_copyout() returns EFAULT.  ZFS_ENTER() is only taken by the
            * seek and fill-count commands, and is exited before copying out.
            */
 297  297  /* ARGSUSED */
 298  298  static int
 299  299  zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
 300  300      int *rvalp, caller_context_t *ct)
 301  301  {
 302  302          offset_t off;
 303  303          offset_t ndata;
 304  304          dmu_object_info_t doi;
 305  305          int error;
 306  306          zfsvfs_t *zfsvfs;
 307  307          znode_t *zp;
 308  308  
 309  309          switch (com) {
 310  310          case _FIOFFS:
 311  311          {
 312  312                  return (zfs_sync(vp->v_vfsp, 0, cred));
 313  313  
 314  314                  /*
 315  315                   * The two ioctls handled next (_FIOGDIO, _FIOSDIO) are used
 316  316                   * by bfu.  They are faked out (return 0) to avoid bfu errors.
 317  317                   */
 318  318          }
 319  319          case _FIOGDIO:
 320  320          case _FIOSDIO:
 321  321          {
 322  322                  return (0);
 323  323          }
 324  324  
 325  325          case _FIO_SEEK_DATA:
 326  326          case _FIO_SEEK_HOLE:
 327  327          {
 328  328                  if (ddi_copyin((void *)data, &off, sizeof (off), flag))
 329  329                          return (SET_ERROR(EFAULT));
 330  330  
 331  331                  zp = VTOZ(vp);
 332  332                  zfsvfs = zp->z_zfsvfs;
 333  333                  ZFS_ENTER(zfsvfs);
 334  334                  ZFS_VERIFY_ZP(zp);
 335  335  
 336  336                  /* offset parameter is in/out */
 337  337                  error = zfs_holey(vp, com, &off);
 338  338                  ZFS_EXIT(zfsvfs);
 339  339                  if (error)
 340  340                          return (error);
 341  341                  if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
 342  342                          return (SET_ERROR(EFAULT));
 343  343                  return (0);
 344  344          }
 345  345          case _FIO_COUNT_FILLED:
 346  346          {
 347  347                  /*
 348  348                   * _FIO_COUNT_FILLED adds a new ioctl command which
 349  349                   * exposes the number of filled blocks in a
 350  350                   * ZFS object.
 351  351                   */
 352  352                  zp = VTOZ(vp);
 353  353                  zfsvfs = zp->z_zfsvfs;
 354  354                  ZFS_ENTER(zfsvfs);
 355  355                  ZFS_VERIFY_ZP(zp);
 356  356  
 357  357                  /*
 358  358                   * Wait for all dirty blocks for this object
 359  359                   * to get synced out to disk, and the DMU info
 360  360                   * updated.
 361  361                   */
 362  362                  error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
 363  363                  if (error) {
 364  364                          ZFS_EXIT(zfsvfs);
 365  365                          return (error);
 366  366                  }
 367  367  
 368  368                  /*
 369  369                   * Retrieve fill count from DMU object.
 370  370                   */
 371  371                  error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
 372  372                  if (error) {
 373  373                          ZFS_EXIT(zfsvfs);
 374  374                          return (error);
 375  375                  }
 376  376  
 377  377                  ndata = doi.doi_fill_count;
 378  378  
 379  379                  ZFS_EXIT(zfsvfs);
 380  380                  if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
 381  381                          return (SET_ERROR(EFAULT));
 382  382                  return (0);
 383  383          }
 384  384          }
 385  385          return (SET_ERROR(ENOTTY));
 386  386  }
 387  387  
 388  388  /*
 389  389   * Utility functions to map and unmap a single physical page.  These
 390  390   * are used to manage the mappable copies of ZFS file data, and therefore
 391  391   * do not update ref/mod bits.
            *
            * zfs_map_page() returns the address at which pp has been mapped.
            * When kpm_enable is set the mapping comes from hat_kpm_mapin() and
            * rw is not consulted.  Otherwise rw must be S_READ or S_WRITE
            * (asserted) and ppmapin() is called with PROT_READ, plus PROT_WRITE
            * for S_WRITE.
 392  392   */
 393  393  caddr_t
 394  394  zfs_map_page(page_t *pp, enum seg_rw rw)
 395  395  {
 396  396          if (kpm_enable)
 397  397                  return (hat_kpm_mapin(pp, 0));
 398  398          ASSERT(rw == S_READ || rw == S_WRITE);
 399  399          return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
 400  400              (caddr_t)-1));
 401  401  }
 402  402  
           /*
            * Release a mapping made by zfs_map_page().  The same kpm_enable test
            * selects the matching teardown: hat_kpm_mapout() when it is set,
            * ppmapout() otherwise.  Callers in this file pass the address that
            * zfs_map_page() returned for pp as addr.
            */
 403  403  void
 404  404  zfs_unmap_page(page_t *pp, caddr_t addr)
 405  405  {
 406  406          if (kpm_enable) {
 407  407                  hat_kpm_mapout(pp, 0, addr);
 408  408          } else {
 409  409                  ppmapout(addr);
 410  410          }
 411  411  }
 412  412  
 413  413  /*
 414  414   * When a file is memory mapped, we must keep the IO data synchronized
 415  415   * between the DMU cache and the memory mapped pages.  What this means:
 416  416   *
 417  417   * On Write:    If we find a memory mapped page, we write to *both*
 418  418   *              the page and the dmu buffer.
            *
            * This routine handles the page side: for each page of vp overlapping
            * [start, start + len) that page_lookup() finds, the affected bytes are
            * copied from the DMU object (os, oid) into the page with dmu_read().
            * Pages that are not found are skipped, and the dmu_read() result is
            * ignored.
 419  419   */
 420  420  static void
 421  421  update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
 422  422  {
 423  423          int64_t off;
 424  424  
                   /* off is the byte offset into the first page; later pages use 0 */
 425  425          off = start & PAGEOFFSET;
 426  426          for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 427  427                  page_t *pp;
 428  428                  uint64_t nbytes = MIN(PAGESIZE - off, len);
 429  429  
 430  430                  if (pp = page_lookup(vp, start, SE_SHARED)) {
 431  431                          caddr_t va;
 432  432  
 433  433                          va = zfs_map_page(pp, S_WRITE);
 434  434                          (void) dmu_read(os, oid, start+off, nbytes, va+off,
 435  435                              DMU_READ_PREFETCH);
 436  436                          zfs_unmap_page(pp, va);
 437  437                          page_unlock(pp);
 438  438                  }
 439  439                  len -= nbytes;
 440  440                  off = 0;
 441  441          }
 442  442  }
 443  443  
 444  444  /*
 445  445   * When a file is memory mapped, we must keep the IO data synchronized
 446  446   * between the DMU cache and the memory mapped pages.  What this means:
 447  447   *
 448  448   * On Read:     We "read" preferentially from memory mapped pages,
 449  449   *              else we default from the dmu buffer.
 450  450   *
 451  451   * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
 452  452   *       the file is memory mapped.
            *
            * Transfers nbytes starting at uio->uio_loffset, one page-sized piece
            * at a time: uiomove() from the page when page_lookup() finds one,
            * dmu_read_uio_dbuf() otherwise.  Stops at the first error and returns
            * it; returns 0 when every piece succeeded.
 453  453   */
 454  454  static int
 455  455  mappedread(vnode_t *vp, int nbytes, uio_t *uio)
 456  456  {
 457  457          znode_t *zp = VTOZ(vp);
 458  458          int64_t start, off;
 459  459          int len = nbytes;
 460  460          int error = 0;
 461  461  
 462  462          start = uio->uio_loffset;
                   /* off is the byte offset into the first page; later pages use 0 */
 463  463          off = start & PAGEOFFSET;
 464  464          for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
 465  465                  page_t *pp;
 466  466                  uint64_t bytes = MIN(PAGESIZE - off, len);
 467  467  
 468  468                  if (pp = page_lookup(vp, start, SE_SHARED)) {
 469  469                          caddr_t va;
 470  470  
 471  471                          va = zfs_map_page(pp, S_READ);
 472  472                          error = uiomove(va + off, bytes, UIO_READ, uio);
 473  473                          zfs_unmap_page(pp, va);
 474  474                          page_unlock(pp);
 475  475                  } else {
 476  476                          error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 477  477                              uio, bytes);
 478  478                  }
 479  479                  len -= bytes;
 480  480                  off = 0;
 481  481                  if (error)
 482  482                          break;
 483  483          }
 484  484          return (error);
 485  485  }
 486  486  
 487  487  offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable: max bytes per zfs_read() pass */
 488  488  
 489  489  /*
 490  490   * Read bytes from specified file into supplied buffer.
 491  491   *
 492  492   *      IN:     vp      - vnode of file to be read from.
 493  493   *              uio     - structure supplying read location, range info,
 494  494   *                        and return buffer.
 495  495   *              ioflag  - SYNC flags; used to provide FRSYNC semantics.
 496  496   *              cr      - credentials of caller.
 497  497   *              ct      - caller context
 498  498   *
 499  499   *      OUT:    uio     - updated offset and range, buffer filled.
 500  500   *
 501  501   *      RETURN: 0 on success, error code on failure.
            *              EACCES if the file is quarantined (ZFS_AV_QUARANTINED),
            *              EINVAL if the offset is negative, EIO in place of
            *              ECKSUM, or the error from chklock() or the read itself.
 502  502   *
 503  503   * Side Effects:
 504  504   *      vp - atime updated if byte count > 0
 505  505   */
 506  506  /* ARGSUSED */
 507  507  static int
 508  508  zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 509  509  {
 510  510          znode_t         *zp = VTOZ(vp);
 511  511          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 512  512          ssize_t         n, nbytes;
 513  513          int             error = 0;
 514  514          rl_t            *rl;
 515  515          xuio_t          *xuio = NULL;
 516  516  
 517  517          ZFS_ENTER(zfsvfs);
 518  518          ZFS_VERIFY_ZP(zp);
 519  519  
 520  520          if (zp->z_pflags & ZFS_AV_QUARANTINED) {
 521  521                  ZFS_EXIT(zfsvfs);
 522  522                  return (SET_ERROR(EACCES));
 523  523          }
 524  524  
 525  525          /*
 526  526           * Validate file offset
 527  527           */
 528  528          if (uio->uio_loffset < (offset_t)0) {
 529  529                  ZFS_EXIT(zfsvfs);
 530  530                  return (SET_ERROR(EINVAL));
 531  531          }
 532  532  
 533  533          /*
 534  534           * Fasttrack empty reads
 535  535           */
 536  536          if (uio->uio_resid == 0) {
 537  537                  ZFS_EXIT(zfsvfs);
 538  538                  return (0);
 539  539          }
 540  540  
 541  541          /*
 542  542           * Check for mandatory locks
 543  543           */
 544  544          if (MANDMODE(zp->z_mode)) {
 545  545                  if (error = chklock(vp, FREAD,
 546  546                      uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
 547  547                          ZFS_EXIT(zfsvfs);
 548  548                          return (error);
 549  549                  }
 550  550          }
 551  551  
 552  552          /*
 553  553           * If we're in FRSYNC mode, sync out this znode before reading it.
 554  554           */
 555  555          if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
 556  556                  zil_commit(zfsvfs->z_log, zp->z_id);
 557  557  
 558  558          /*
 559  559           * Lock the range against changes.
 560  560           */
 561  561          rl = zfs_range_lock(zp, uio->uio_loffset, uio->uio_resid, RL_READER);
 562  562  
 563  563          /*
 564  564           * If we are reading past end-of-file we can skip
 565  565           * to the end; but we might still need to set atime.
 566  566           */
 567  567          if (uio->uio_loffset >= zp->z_size) {
 568  568                  error = 0;
 569  569                  goto out;
 570  570          }
 571  571  
 572  572          ASSERT(uio->uio_loffset < zp->z_size);
                   /* Clamp the request so that it does not run past end-of-file */
 573  573          n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
 574  574  
                   /*
                    * Zero-copy request (UIOTYPE_ZEROCOPY): size the xuio for the
                    * number of blocks that cover [offset, offset + n).
                    */
 575  575          if ((uio->uio_extflg == UIO_XUIO) &&
 576  576              (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
 577  577                  int nblk;
 578  578                  int blksz = zp->z_blksz;
 579  579                  uint64_t offset = uio->uio_loffset;
 580  580  
 581  581                  xuio = (xuio_t *)uio;
 582  582                  if ((ISP2(blksz))) {
 583  583                          nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
 584  584                              blksz)) / blksz;
 585  585                  } else {
 586  586                          ASSERT(offset + n <= blksz);
 587  587                          nblk = 1;
 588  588                  }
 589  589                  (void) dmu_xuio_init(xuio, nblk);
 590  590  
 591  591                  if (vn_has_cached_data(vp)) {
 592  592                          /*
 593  593                           * For simplicity, we always allocate a full buffer
 594  594                           * even if we only expect to read a portion of a block.
 595  595                           */
 596  596                          while (--nblk >= 0) {
 597  597                                  (void) dmu_xuio_add(xuio,
 598  598                                      dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 599  599                                      blksz), 0, blksz);
 600  600                          }
 601  601                  }
 602  602          }
 603  603  
                   /*
                    * Read in pieces; each piece ends at the next multiple of
                    * zfs_read_chunk_size or at the end of the request.
                    */
 604  604          while (n > 0) {
 605  605                  nbytes = MIN(n, zfs_read_chunk_size -
 606  606                      P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
 607  607  
 608  608                  if (vn_has_cached_data(vp)) {
 609  609                          error = mappedread(vp, nbytes, uio);
 610  610                  } else {
 611  611                          error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 612  612                              uio, nbytes);
 613  613                  }
 614  614                  if (error) {
 615  615                          /* convert checksum errors into IO errors */
 616  616                          if (error == ECKSUM)
 617  617                                  error = SET_ERROR(EIO);
 618  618                          break;
 619  619                  }
 620  620  
 621  621                  n -= nbytes;
 622  622          }
 623  623  out:
                   /* Reached on success, on a read error, and for reads at or past EOF */
 624  624          zfs_range_unlock(rl);
 625  625  
 626  626          ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
 627  627          ZFS_EXIT(zfsvfs);
 628  628          return (error);
 629  629  }
 630  630  
 631  631  /*
 632  632   * Write the bytes to a file.
 633  633   *
 634  634   *      IN:     vp      - vnode of file to be written to.
 635  635   *              uio     - structure supplying write location, range info,
 636  636   *                        and data buffer.
 637  637   *              ioflag  - FAPPEND, FSYNC, and/or FDSYNC.  FAPPEND is
 638  638   *                        set if in append mode.
 639  639   *              cr      - credentials of caller.
 640  640   *              ct      - caller context (NFS/CIFS fem monitor only)
 641  641   *
 642  642   *      OUT:    uio     - updated offset and range.
 643  643   *
 644  644   *      RETURN: 0 on success, error code on failure.
 645  645   *
 646  646   * Timestamps:
 647  647   *      vp - ctime|mtime updated if byte count > 0
 648  648   */
 649  649  
 650  650  /* ARGSUSED */
 651  651  static int
 652  652  zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 653  653  {
 654  654          znode_t         *zp = VTOZ(vp);
 655  655          rlim64_t        limit = uio->uio_llimit;
 656  656          ssize_t         start_resid = uio->uio_resid;
 657  657          ssize_t         tx_bytes;
 658  658          uint64_t        end_size;
 659  659          dmu_tx_t        *tx;
 660  660          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 661  661          zilog_t         *zilog;
 662  662          offset_t        woff;
 663  663          ssize_t         n, nbytes;
 664  664          rl_t            *rl;
 665  665          int             max_blksz = zfsvfs->z_max_blksz;
 666  666          int             error = 0;
 667  667          int             prev_error;
 668  668          arc_buf_t       *abuf;
 669  669          iovec_t         *aiov = NULL;
 670  670          xuio_t          *xuio = NULL;
 671  671          int             i_iov = 0;
 672  672          int             iovcnt = uio->uio_iovcnt;
 673  673          iovec_t         *iovp = uio->uio_iov;
 674  674          int             write_eof;
 675  675          int             count = 0;
 676  676          sa_bulk_attr_t  bulk[4];
 677  677          uint64_t        mtime[2], ctime[2];
 678  678

↓ open down ↓

678 lines elided

↑ open up ↑

 679  679          /*
 680  680           * Fasttrack empty write
 681  681           */
 682  682          n = start_resid;
 683  683          if (n == 0)
 684  684                  return (0);
 685  685  
 686  686          if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 687  687                  limit = MAXOFFSET_T;
 688  688  
 689      -        /*
 690      -         * Pre-fault the pages to ensure slow (eg NFS) pages
 691      -         * don't hold up txg.
 692      -         * Skip this if uio contains loaned arc_buf.
 693      -         */
 694      -        if ((uio->uio_extflg == UIO_XUIO) &&
 695      -            (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
 696      -                xuio = (xuio_t *)uio;
 697      -        else
 698      -                uio_prefaultpages(n, uio);
 699      -
 700  689          ZFS_ENTER(zfsvfs);
 701  690          ZFS_VERIFY_ZP(zp);
 702  691  
 703  692          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
 704  693          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
 705  694          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 706  695              &zp->z_size, 8);
 707  696          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 708  697              &zp->z_pflags, 8);
 709  698

 710  699          /*
 711  700           * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
 712  701           * callers might not be able to detect properly that we are read-only,
 713  702           * so check it explicitly here.
 714  703           */
 715  704          if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
 716  705                  ZFS_EXIT(zfsvfs);
 717  706                  return (SET_ERROR(EROFS));
 718  707          }
 719  708  
 720  709          /*
 721  710           * If immutable or not appending then return EPERM
 722  711           */
 723  712          if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) ||
 724  713              ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
 725  714              (uio->uio_loffset < zp->z_size))) {
 726  715                  ZFS_EXIT(zfsvfs);
 727  716                  return (SET_ERROR(EPERM));
 728  717          }
 729  718  
 730  719          zilog = zfsvfs->z_log;
 731  720  
 732  721          /*
 733  722           * Validate file offset
 734  723           */
 735  724          woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
 736  725          if (woff < 0) {
 737  726                  ZFS_EXIT(zfsvfs);
 738  727                  return (SET_ERROR(EINVAL));
 739  728          }
 740  729  
 741  730          /*

↓ open down ↓

32 lines elided

↑ open up ↑

 742  731           * Check for mandatory locks before calling zfs_range_lock()
 743  732           * in order to prevent a deadlock with locks set via fcntl().
 744  733           */
 745  734          if (MANDMODE((mode_t)zp->z_mode) &&
 746  735              (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
 747  736                  ZFS_EXIT(zfsvfs);
 748  737                  return (error);
 749  738          }
 750  739  
 751  740          /*
      741 +         * Pre-fault the pages to ensure slow (eg NFS) pages
      742 +         * don't hold up txg.
      743 +         * Skip this if uio contains loaned arc_buf.
      744 +         */
      745 +        if ((uio->uio_extflg == UIO_XUIO) &&
      746 +            (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
      747 +                xuio = (xuio_t *)uio;
      748 +        else
      749 +                uio_prefaultpages(MIN(n, max_blksz), uio);
      750 +
      751 +        /*
 752  752           * If in append mode, set the io offset pointer to eof.
 753  753           */
 754  754          if (ioflag & FAPPEND) {
 755  755                  /*
 756  756                   * Obtain an appending range lock to guarantee file append
 757  757                   * semantics.  We reset the write offset once we have the lock.
 758  758                   */
 759  759                  rl = zfs_range_lock(zp, 0, n, RL_APPEND);
 760  760                  woff = rl->r_off;
 761  761                  if (rl->r_len == UINT64_MAX) {

 762  762                          /*
 763  763                           * We overlocked the file because this write will cause
 764  764                           * the file block size to increase.
 765  765                           * Note that zp_size cannot change with this lock held.
 766  766                           */
 767  767                          woff = zp->z_size;
 768  768                  }
 769  769                  uio->uio_loffset = woff;
 770  770          } else {
 771  771                  /*
 772  772                   * Note that if the file block size will change as a result of
 773  773                   * this write, then this range lock will lock the entire file
 774  774                   * so that we can re-write the block safely.
 775  775                   */
 776  776                  rl = zfs_range_lock(zp, woff, n, RL_WRITER);
 777  777          }
 778  778  
 779  779          if (woff >= limit) {
 780  780                  zfs_range_unlock(rl);
 781  781                  ZFS_EXIT(zfsvfs);
 782  782                  return (SET_ERROR(EFBIG));
 783  783          }
 784  784  
 785  785          if ((woff + n) > limit || woff > (limit - n))
 786  786                  n = limit - woff;
 787  787  
 788  788          /* Will this write extend the file length? */
 789  789          write_eof = (woff + n > zp->z_size);
 790  790  
 791  791          end_size = MAX(zp->z_size, woff + n);
 792  792  
 793  793          /*
 794  794           * Write the file in reasonable size chunks.  Each chunk is written
 795  795           * in a separate transaction; this keeps the intent log records small
 796  796           * and allows us to do more fine-grained space accounting.
 797  797           */
 798  798          while (n > 0) {
 799  799                  abuf = NULL;
 800  800                  woff = uio->uio_loffset;
 801  801                  if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
 802  802                      zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
 803  803                          if (abuf != NULL)
 804  804                                  dmu_return_arcbuf(abuf);
 805  805                          error = SET_ERROR(EDQUOT);
 806  806                          break;
 807  807                  }
 808  808  
 809  809                  if (xuio && abuf == NULL) {
 810  810                          ASSERT(i_iov < iovcnt);
 811  811                          aiov = &iovp[i_iov];
 812  812                          abuf = dmu_xuio_arcbuf(xuio, i_iov);
 813  813                          dmu_xuio_clear(xuio, i_iov);
 814  814                          DTRACE_PROBE3(zfs_cp_write, int, i_iov,
 815  815                              iovec_t *, aiov, arc_buf_t *, abuf);
 816  816                          ASSERT((aiov->iov_base == abuf->b_data) ||
 817  817                              ((char *)aiov->iov_base - (char *)abuf->b_data +
 818  818                              aiov->iov_len == arc_buf_size(abuf)));
 819  819                          i_iov++;
 820  820                  } else if (abuf == NULL && n >= max_blksz &&
 821  821                      woff >= zp->z_size &&
 822  822                      P2PHASE(woff, max_blksz) == 0 &&
 823  823                      zp->z_blksz == max_blksz) {
 824  824                          /*
 825  825                           * This write covers a full block.  "Borrow" a buffer
 826  826                           * from the dmu so that we can fill it before we enter
 827  827                           * a transaction.  This avoids the possibility of
 828  828                           * holding up the transaction if the data copy hangs
 829  829                           * up on a pagefault (e.g., from an NFS server mapping).
 830  830                           */
 831  831                          size_t cbytes;
 832  832  
 833  833                          abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
 834  834                              max_blksz);
 835  835                          ASSERT(abuf != NULL);
 836  836                          ASSERT(arc_buf_size(abuf) == max_blksz);
 837  837                          if (error = uiocopy(abuf->b_data, max_blksz,
 838  838                              UIO_WRITE, uio, &cbytes)) {
 839  839                                  dmu_return_arcbuf(abuf);
 840  840                                  break;
 841  841                          }
 842  842                          ASSERT(cbytes == max_blksz);
 843  843                  }
 844  844  
 845  845                  /*
 846  846                   * Start a transaction.
 847  847                   */
 848  848                  tx = dmu_tx_create(zfsvfs->z_os);
 849  849                  dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
 850  850                  dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
 851  851                  zfs_sa_upgrade_txholds(tx, zp);
 852  852                  error = dmu_tx_assign(tx, TXG_WAIT);
 853  853                  if (error) {
 854  854                          dmu_tx_abort(tx);
 855  855                          if (abuf != NULL)
 856  856                                  dmu_return_arcbuf(abuf);
 857  857                          break;
 858  858                  }
 859  859  
 860  860                  /*
 861  861                   * If zfs_range_lock() over-locked we grow the blocksize
 862  862                   * and then reduce the lock range.  This will only happen
 863  863                   * on the first iteration since zfs_range_reduce() will
 864  864                   * shrink down r_len to the appropriate size.
 865  865                   */
 866  866                  if (rl->r_len == UINT64_MAX) {
 867  867                          uint64_t new_blksz;
 868  868  
 869  869                          if (zp->z_blksz > max_blksz) {
 870  870                                  /*
 871  871                                   * File's blocksize is already larger than the
 872  872                                   * "recordsize" property.  Only let it grow to
 873  873                                   * the next power of 2.
 874  874                                   */
 875  875                                  ASSERT(!ISP2(zp->z_blksz));
 876  876                                  new_blksz = MIN(end_size,
 877  877                                      1 << highbit64(zp->z_blksz));
 878  878                          } else {
 879  879                                  new_blksz = MIN(end_size, max_blksz);
 880  880                          }
 881  881                          zfs_grow_blocksize(zp, new_blksz, tx);
 882  882                          zfs_range_reduce(rl, woff, n);
 883  883                  }
 884  884  
 885  885                  /*
 886  886                   * XXX - should we really limit each write to z_max_blksz?
 887  887                   * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
 888  888                   */
 889  889                  nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
 890  890  
 891  891                  if (abuf == NULL) {
 892  892                          tx_bytes = uio->uio_resid;
 893  893                          error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
 894  894                              uio, nbytes, tx);
 895  895                          tx_bytes -= uio->uio_resid;
 896  896                  } else {
 897  897                          tx_bytes = nbytes;
 898  898                          ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
 899  899                          /*
 900  900                           * If this is not a full block write, but we are
 901  901                           * extending the file past EOF and this data starts
 902  902                           * block-aligned, use assign_arcbuf().  Otherwise,
 903  903                           * write via dmu_write().
 904  904                           */
 905  905                          if (tx_bytes < max_blksz && (!write_eof ||
 906  906                              aiov->iov_base != abuf->b_data)) {
 907  907                                  ASSERT(xuio);
 908  908                                  dmu_write(zfsvfs->z_os, zp->z_id, woff,
 909  909                                      aiov->iov_len, aiov->iov_base, tx);
 910  910                                  dmu_return_arcbuf(abuf);
 911  911                                  xuio_stat_wbuf_copied();
 912  912                          } else {
 913  913                                  ASSERT(xuio || tx_bytes == max_blksz);
 914  914                                  dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
 915  915                                      woff, abuf, tx);
 916  916                          }
 917  917                          ASSERT(tx_bytes <= uio->uio_resid);
 918  918                          uioskip(uio, tx_bytes);
 919  919                  }
 920  920                  if (tx_bytes && vn_has_cached_data(vp)) {
 921  921                          update_pages(vp, woff,
 922  922                              tx_bytes, zfsvfs->z_os, zp->z_id);
 923  923                  }
 924  924  
 925  925                  /*
 926  926                   * If we made no progress, we're done.  If we made even
 927  927                   * partial progress, update the znode and ZIL accordingly.
 928  928                   */
 929  929                  if (tx_bytes == 0) {
 930  930                          (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
 931  931                              (void *)&zp->z_size, sizeof (uint64_t), tx);
 932  932                          dmu_tx_commit(tx);
 933  933                          ASSERT(error != 0);
 934  934                          break;
 935  935                  }
 936  936  
 937  937                  /*
 938  938                   * Clear Set-UID/Set-GID bits on successful write if not
 939  939                   * privileged and at least one of the execute bits is set.
 940  940                   *
 941  941                   * It would be nice to do this after all writes have
 942  942                   * been done, but that would still expose the ISUID/ISGID
 943  943                   * to another app after the partial write is committed.
 944  944                   *
 945  945                   * Note: we don't call zfs_fuid_map_id() here because
 946  946                   * user 0 is not an ephemeral uid.
 947  947                   */
 948  948                  mutex_enter(&zp->z_acl_lock);
 949  949                  if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
 950  950                      (S_IXUSR >> 6))) != 0 &&
 951  951                      (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
 952  952                      secpolicy_vnode_setid_retain(cr,
 953  953                      (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
 954  954                          uint64_t newmode;
 955  955                          zp->z_mode &= ~(S_ISUID | S_ISGID);
 956  956                          newmode = zp->z_mode;
 957  957                          (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 958  958                              (void *)&newmode, sizeof (uint64_t), tx);
 959  959                  }
 960  960                  mutex_exit(&zp->z_acl_lock);
 961  961  
 962  962                  zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 963  963                      B_TRUE);
 964  964  
 965  965                  /*
 966  966                   * Update the file size (zp_size) if it has changed;
 967  967                   * account for possible concurrent updates.
 968  968                   */
 969  969                  while ((end_size = zp->z_size) < uio->uio_loffset) {
 970  970                          (void) atomic_cas_64(&zp->z_size, end_size,
 971  971                              uio->uio_loffset);
 972  972                  }
 973  973                  /*
 974  974                   * If we are replaying and eof is non zero then force
 975  975                   * the file size to the specified eof. Note, there's no
 976  976                   * concurrency during replay.
 977  977                   */
 978  978                  if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 979  979                          zp->z_size = zfsvfs->z_replay_eof;
 980  980  
 981  981                  /*
 982  982                   * Keep track of a possible pre-existing error from a partial
 983  983                   * write via dmu_write_uio_dbuf above.
 984  984                   */

↓ open down ↓

223 lines elided

↑ open up ↑

 985  985                  prev_error = error;
 986  986                  error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 987  987  
 988  988                  zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 989  989                  dmu_tx_commit(tx);
 990  990  
 991  991                  if (prev_error != 0 || error != 0)
 992  992                          break;
 993  993                  ASSERT(tx_bytes == nbytes);
 994  994                  n -= nbytes;
      995 +
      996 +                if (!xuio && n > 0)
      997 +                        uio_prefaultpages(MIN(n, max_blksz), uio);
 995  998          }
 996  999  
 997 1000          zfs_range_unlock(rl);
 998 1001  
 999 1002          /*
1000 1003           * If we're in replay mode, or we made no progress, return error.
1001 1004           * Otherwise, it's at least a partial write, so it's successful.
1002 1005           */
1003 1006          if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1004 1007                  ZFS_EXIT(zfsvfs);

1005 1008                  return (error);
1006 1009          }
1007 1010  
1008 1011          if (ioflag & (FSYNC | FDSYNC) ||
1009 1012              zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1010 1013                  zil_commit(zilog, zp->z_id);
1011 1014  
1012 1015          ZFS_EXIT(zfsvfs);
1013 1016          return (0);
1014 1017  }
1015 1018  
           /*
            * Release everything a zgd_t from zfs_get_data() is holding and free it.
            *
            * This is handed to dmu_sync() as its completion callback by
            * zfs_get_data(), and is also called directly by zfs_get_data() on the
            * paths that do not leave a dmu_sync() outstanding.
            *
            *      IN:     zgd     - state built by zfs_get_data(); freed on return.
            *              error   - result of the operation; the block pointer is
            *                        only handed to the ZIL when this is 0.
            */
1016 1019  void
1017 1020  zfs_get_done(zgd_t *zgd, int error)
1018 1021  {
1019 1022          znode_t *zp = zgd->zgd_private;
1020 1023          objset_t *os = zp->z_zfsvfs->z_os;
1021 1024  
                   /*
                    * zgd_db is only set on the indirect-write path, once the dbuf
                    * has been held with zgd as the tag.
                    */
1022 1025          if (zgd->zgd_db)
1023 1026                  dmu_buf_rele(zgd->zgd_db, zgd);
1024 1027  
1025 1028          zfs_range_unlock(zgd->zgd_rl);
1026 1029  
1027 1030          /*
1028 1031           * Release the vnode asynchronously as we currently have the
1029 1032           * txg stopped from syncing.
1030 1033           */
1031 1034          VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1032 1035  
                   /* Only a successful operation with a block pointer is recorded. */
1033 1036          if (error == 0 && zgd->zgd_bp)
1034 1037                  zil_add_block(zgd->zgd_zilog, zgd->zgd_bp);
1035 1038  
1036 1039          kmem_free(zgd, sizeof (zgd_t));
1037 1040  }
1038 1041  
1039 1042  #ifdef DEBUG
           /*
            * Debug-only fault injection: when non-zero, the next indirect write
            * handled by zfs_get_data() fails with EIO and the flag is reset to 0.
            */
1040 1043  static int zil_fault_io = 0;
1041 1044  #endif
1042 1045  
1043 1046  /*
1044 1047   * Get data to generate a TX_WRITE intent log record.
            *
            *      IN:     arg     - the zfsvfs_t of the file system being logged.
            *              lr      - write log record; supplies the object id,
            *                        offset and length, and receives the block
            *                        pointer for an indirect write.
            *              buf     - destination for an immediate write, or NULL
            *                        to request an indirect write.
            *              zio     - parent zio passed to dmu_sync(); must not be
            *                        NULL.
            *
            *      RETURN: 0 on success.
            *              ENOENT if the znode cannot be obtained, has been
            *              unlinked, or the record starts at or beyond the current
            *              file size.
            *              Otherwise the error from dmu_read(), dmu_buf_hold() or
            *              dmu_sync(), except that EALREADY from dmu_sync() is
            *              turned into success with the record retyped as
            *              TX_WRITE2.
1045 1048   */
1046 1049  int
1047 1050  zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio)
1048 1051  {
1049 1052          zfsvfs_t *zfsvfs = arg;
1050 1053          objset_t *os = zfsvfs->z_os;
1051 1054          znode_t *zp;
1052 1055          uint64_t object = lr->lr_foid;
1053 1056          uint64_t offset = lr->lr_offset;
1054 1057          uint64_t size = lr->lr_length;
1055 1058          blkptr_t *bp = &lr->lr_blkptr;
1056 1059          dmu_buf_t *db;
1057 1060          zgd_t *zgd;
1058 1061          int error = 0;
1059 1062  
1060 1063          ASSERT(zio != NULL);
1061 1064          ASSERT(size != 0);
1062 1065  
1063 1066          /*
1064 1067           * Nothing to do if the file has been removed
1065 1068           */
1066 1069          if (zfs_zget(zfsvfs, object, &zp) != 0)
1067 1070                  return (SET_ERROR(ENOENT));
1068 1071          if (zp->z_unlinked) {
1069 1072                  /*
1070 1073                   * Release the vnode asynchronously as we currently have the
1071 1074                   * txg stopped from syncing.
1072 1075                   */
1073 1076                  VN_RELE_ASYNC(ZTOV(zp),
1074 1077                      dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1075 1078                  return (SET_ERROR(ENOENT));
1076 1079          }
1077 1080  
                   /*
                    * From here on every exit goes through zfs_get_done(), either
                    * directly below or as the dmu_sync() callback, which drops the
                    * range lock and the vnode hold and frees the zgd.
                    */
1078 1081          zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1079 1082          zgd->zgd_zilog = zfsvfs->z_log;
1080 1083          zgd->zgd_private = zp;
1081 1084  
1082 1085          /*
1083 1086           * Write records come in two flavors: immediate and indirect.
1084 1087           * For small writes it's cheaper to store the data with the
1085 1088           * log record (immediate); for large writes it's cheaper to
1086 1089           * sync the data and get a pointer to it (indirect) so that
1087 1090           * we don't have to write the data twice.
1088 1091           */
1089 1092          if (buf != NULL) { /* immediate write */
1090 1093                  zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER);
1091 1094                  /* test for truncation needs to be done while range locked */
1092 1095                  if (offset >= zp->z_size) {
1093 1096                          error = SET_ERROR(ENOENT);
1094 1097                  } else {
1095 1098                          error = dmu_read(os, object, offset, size, buf,
1096 1099                              DMU_READ_NO_PREFETCH);
1097 1100                  }
1098 1101                  ASSERT(error == 0 || error == ENOENT);
1099 1102          } else { /* indirect write */
1100 1103                  /*
1101 1104                   * Have to lock the whole block to ensure when it's
1102 1105                   * written out and its checksum is being calculated
1103 1106                   * that no one can change the data. We need to re-check
1104 1107                   * blocksize after we get the lock in case it's changed!
1105 1108                   */
1106 1109                  for (;;) {
1107 1110                          uint64_t blkoff;
1108 1111                          size = zp->z_blksz;
                                   /*
                                    * Round offset down to the start of its block.
                                    * When the block size is not a power of two the
                                    * range starts at offset 0 (presumably such a
                                    * file has only one block -- TODO confirm).
                                    */
1109 1112                          blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1110 1113                          offset -= blkoff;
1111 1114                          zgd->zgd_rl = zfs_range_lock(zp, offset, size,
1112 1115                              RL_READER);
1113 1116                          if (zp->z_blksz == size)
1114 1117                                  break;
                                   /*
                                    * Block size changed while waiting for the lock:
                                    * restore the original offset and retry.
                                    */
1115 1118                          offset += blkoff;
1116 1119                          zfs_range_unlock(zgd->zgd_rl);
1117 1120                  }
1118 1121                  /* test for truncation needs to be done while range locked */
1119 1122                  if (lr->lr_offset >= zp->z_size)
1120 1123                          error = SET_ERROR(ENOENT);
1121 1124  #ifdef DEBUG
1122 1125                  if (zil_fault_io) {
1123 1126                          error = SET_ERROR(EIO);
1124 1127                          zil_fault_io = 0;
1125 1128                  }
1126 1129  #endif
1127 1130                  if (error == 0)
1128 1131                          error = dmu_buf_hold(os, object, offset, zgd, &db,
1129 1132                              DMU_READ_NO_PREFETCH);
1130 1133  
1131 1134                  if (error == 0) {
1132 1135                          blkptr_t *obp = dmu_buf_get_blkptr(db);
                                   /*
                                    * Seed the log record's block pointer with the
                                    * dbuf's current one, when it has one.
                                    */
1133 1136                          if (obp) {
1134 1137                                  ASSERT(BP_IS_HOLE(bp));
1135 1138                                  *bp = *obp;
1136 1139                          }
1137 1140  
1138 1141                          zgd->zgd_db = db;
1139 1142                          zgd->zgd_bp = bp;
1140 1143  
1141 1144                          ASSERT(db->db_offset == offset);
1142 1145                          ASSERT(db->db_size == size);
1143 1146  
1144 1147                          error = dmu_sync(zio, lr->lr_common.lrc_txg,
1145 1148                              zfs_get_done, zgd);
1146 1149                          ASSERT(error || lr->lr_length <= zp->z_blksz);
1147 1150  
1148 1151                          /*
1149 1152                           * On success, we need to wait for the write I/O
1150 1153                           * initiated by dmu_sync() to complete before we can
1151 1154                           * release this dbuf.  We will finish everything up
1152 1155                           * in the zfs_get_done() callback.
1153 1156                           */
1154 1157                          if (error == 0)
1155 1158                                  return (0);
1156 1159  
                                   /*
                                    * EALREADY is not treated as a failure: the
                                    * record is retyped as TX_WRITE2 and success is
                                    * returned after the cleanup below.
                                    */
1157 1160                          if (error == EALREADY) {
1158 1161                                  lr->lr_common.lrc_txtype = TX_WRITE2;
1159 1162                                  error = 0;
1160 1163                          }
1161 1164                  }
1162 1165          }
1163 1166  
                   /* No dmu_sync() callback is pending on this path; clean up here. */
1164 1167          zfs_get_done(zgd, error);
1165 1168  
1166 1169          return (error);
1167 1170  }
1168 1171  
1169 1172  /*ARGSUSED*/
           /*
            * Check whether the caller may access a file in the requested way.
            *
            *      IN:     vp      - vnode of the file to check.
            *              mode    - requested access; passed unchanged to the
            *                        underlying access check.
            *              flag    - if any V_ACE_MASK bit is set the check goes
            *                        through zfs_zaccess(), otherwise through
            *                        zfs_zaccess_rwx().
            *              cr      - credentials of caller.
            *              ct      - caller context [UNUSED].
            *
            *      RETURN: the result of the underlying access check
            *              (presumably 0 when access is granted -- TODO confirm).
            */
1170 1173  static int
1171 1174  zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1172 1175      caller_context_t *ct)
1173 1176  {
1174 1177          znode_t *zp = VTOZ(vp);
1175 1178          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1176 1179          int error;
1177 1180  
1178 1181          ZFS_ENTER(zfsvfs);
1179 1182          ZFS_VERIFY_ZP(zp);
1180 1183  
                   /*
                    * NOTE(review): with V_ACE_MASK set, mode presumably already
                    * holds ACE-style permission bits rather than rwx bits --
                    * confirm against callers.
                    */
1181 1184          if (flag & V_ACE_MASK)
1182 1185                  error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1183 1186          else
1184 1187                  error = zfs_zaccess_rwx(zp, mode, flag, cr);
1185 1188  
1186 1189          ZFS_EXIT(zfsvfs);
1187 1190          return (error);
1188 1191  }
1189 1192  
1190 1193  /*
1191 1194   * If vnode is for a device return a specfs vnode instead.
            *
            *      IN/OUT: vpp     - vnode to check.  When it is a device vnode,
            *                        the reference held on it is released and *vpp
            *                        is replaced by the result of specvp(), which
            *                        is NULL on failure.  Otherwise it is left
            *                        untouched.
            *      IN:     cr      - credentials of caller, passed to specvp().
            *
            *      RETURN: 0 on success, ENOSYS if specvp() returned NULL.
1192 1195   */
1193 1196  static int
1194 1197  specvp_check(vnode_t **vpp, cred_t *cr)
1195 1198  {
1196 1199          int error = 0;
1197 1200  
1198 1201          if (IS_DEVVP(*vpp)) {
1199 1202                  struct vnode *svp;
1200 1203  
1201 1204                  svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
                           /* The original vnode is released whether or not specvp() worked. */
1202 1205                  VN_RELE(*vpp);
1203 1206                  if (svp == NULL)
1204 1207                          error = SET_ERROR(ENOSYS);
1205 1208                  *vpp = svp;
1206 1209          }
1207 1210          return (error);
1208 1211  }
1209 1212  
1210 1213  
1211 1214  /*
1212 1215   * Lookup an entry in a directory, or an extended attribute directory.
1213 1216   * If it exists, return a held vnode reference for it.
1214 1217   *
1215 1218   *      IN:     dvp     - vnode of directory to search.
1216 1219   *              nm      - name of entry to lookup.
1217 1220   *              pnp     - full pathname to lookup [UNUSED].
1218 1221   *              flags   - LOOKUP_XATTR set if looking for an attribute.
1219 1222   *              rdir    - root directory vnode [UNUSED].
1220 1223   *              cr      - credentials of caller.
1221 1224   *              ct      - caller context
1222 1225   *              direntflags - directory lookup flags
1223 1226   *              realpnp - returned pathname.
1224 1227   *
1225 1228   *      OUT:    vpp     - vnode of located entry, NULL if not found.
1226 1229   *
1227 1230   *      RETURN: 0 on success, error code on failure.
1228 1231   *
1229 1232   * Timestamps:
1230 1233   *      NA
1231 1234   */
1232 1235  /* ARGSUSED */
1233 1236  static int
1234 1237  zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1235 1238      int flags, vnode_t *rdir, cred_t *cr,  caller_context_t *ct,
1236 1239      int *direntflags, pathname_t *realpnp)
1237 1240  {
1238 1241          znode_t *zdp = VTOZ(dvp);
1239 1242          zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1240 1243          int     error = 0;
1241 1244  
                   /*
                    * The fast path runs before ZFS_ENTER() and handles only plain
                    * lookups (no xattr, no case-insensitive matching) that can be
                    * answered from the directory itself or from the DNLC.
                    */
1242 1245          /* fast path */
1243 1246          if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1244 1247  
1245 1248                  if (dvp->v_type != VDIR) {
1246 1249                          return (SET_ERROR(ENOTDIR));
1247 1250                  } else if (zdp->z_sa_hdl == NULL) {
1248 1251                          return (SET_ERROR(EIO));
1249 1252                  }
1250 1253  
                           /* An empty name or "." resolves to the directory itself. */
1251 1254                  if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1252 1255                          error = zfs_fastaccesschk_execute(zdp, cr);
1253 1256                          if (!error) {
1254 1257                                  *vpp = dvp;
1255 1258                                  VN_HOLD(*vpp);
1256 1259                                  return (0);
1257 1260                          }
1258 1261                          return (error);
1259 1262                  } else {
1260 1263                          vnode_t *tvp = dnlc_lookup(dvp, nm);
1261 1264  
                                   /* On a DNLC miss, fall through to the slow path below. */
1262 1265                          if (tvp) {
1263 1266                                  error = zfs_fastaccesschk_execute(zdp, cr);
1264 1267                                  if (error) {
1265 1268                                          VN_RELE(tvp);
1266 1269                                          return (error);
1267 1270                                  }
                                           /*
                                            * NOTE(review): DNLC_NO_VNODE presumably
                                            * marks a cached negative entry -- confirm
                                            * against the DNLC documentation.
                                            */
1268 1271                                  if (tvp == DNLC_NO_VNODE) {
1269 1272                                          VN_RELE(tvp);
1270 1273                                          return (SET_ERROR(ENOENT));
1271 1274                                  } else {
1272 1275                                          *vpp = tvp;
1273 1276                                          return (specvp_check(vpp, cr));
1274 1277                                  }
1275 1278                          }
1276 1279                  }
1277 1280          }
1278 1281  
1279 1282          DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1280 1283  
1281 1284          ZFS_ENTER(zfsvfs);
1282 1285          ZFS_VERIFY_ZP(zdp);
1283 1286  
1284 1287          *vpp = NULL;
1285 1288  
1286 1289          if (flags & LOOKUP_XATTR) {
1287 1290                  /*
1288 1291                   * If the xattr property is off, refuse the lookup request.
1289 1292                   */
1290 1293                  if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1291 1294                          ZFS_EXIT(zfsvfs);
1292 1295                          return (SET_ERROR(EINVAL));
1293 1296                  }
1294 1297  
1295 1298                  /*
1296 1299                   * We don't allow recursive attributes..
1297 1300                   * Maybe someday we will.
1298 1301                   */
1299 1302                  if (zdp->z_pflags & ZFS_XATTR) {
1300 1303                          ZFS_EXIT(zfsvfs);
1301 1304                          return (SET_ERROR(EINVAL));
1302 1305                  }
1303 1306  
1304 1307                  if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1305 1308                          ZFS_EXIT(zfsvfs);
1306 1309                          return (error);
1307 1310                  }
1308 1311  
1309 1312                  /*
1310 1313                   * Do we have permission to get into attribute directory?
1311 1314                   */
1312 1315  
                           /* On denial, drop the xattr directory and report no vnode. */
1313 1316                  if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1314 1317                      B_FALSE, cr)) {
1315 1318                          VN_RELE(*vpp);
1316 1319                          *vpp = NULL;
1317 1320                  }
1318 1321  
1319 1322                  ZFS_EXIT(zfsvfs);
1320 1323                  return (error);
1321 1324          }
1322 1325  
1323 1326          if (dvp->v_type != VDIR) {
1324 1327                  ZFS_EXIT(zfsvfs);
1325 1328                  return (SET_ERROR(ENOTDIR));
1326 1329          }
1327 1330  
1328 1331          /*
1329 1332           * Check accessibility of directory.
1330 1333           */
1331 1334  
1332 1335          if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1333 1336                  ZFS_EXIT(zfsvfs);
1334 1337                  return (error);
1335 1338          }
1336 1339  
                   /*
                    * error is passed to u8_validate() only as its out-parameter;
                    * whatever it stores there is discarded, because EILSEQ is
                    * returned on failure and error is reassigned below otherwise.
                    */
1337 1340          if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1338 1341              NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1339 1342                  ZFS_EXIT(zfsvfs);
1340 1343                  return (SET_ERROR(EILSEQ));
1341 1344          }
1342 1345  
1343 1346          error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
                   /* Device vnodes are swapped for their specfs counterpart. */
1344 1347          if (error == 0)
1345 1348                  error = specvp_check(vpp, cr);
1346 1349  
1347 1350          ZFS_EXIT(zfsvfs);
1348 1351          return (error);
1349 1352  }
1350 1353  
1351 1354  /*
1352 1355   * Attempt to create a new entry in a directory.  If the entry
1353 1356   * already exists, truncate the file if permissible, else return
1354 1357   * an error.  Return the vp of the created or trunc'd file.
1355 1358   *
1356 1359   *      IN:     dvp     - vnode of directory to put new file entry in.
1357 1360   *              name    - name of new file entry.
1358 1361   *              vap     - attributes of new file.
1359 1362   *              excl    - flag indicating exclusive or non-exclusive mode.
1360 1363   *              mode    - mode to open file with.
1361 1364   *              cr      - credentials of caller.
1362 1365   *              flag    - file flags; FIGNORECASE and FAPPEND are examined.
1363 1366   *              ct      - caller context
1364 1367   *              vsecp   - ACL to be set
1365 1368   *
1366 1369   *      OUT:    vpp     - vnode of created or trunc'd entry.
1367 1370   *
1368 1371   *      RETURN: 0 on success, error code on failure.
1369 1372   *
1370 1373   * Timestamps:
1371 1374   *      dvp - ctime|mtime updated if new entry created
1372 1375   *       vp - ctime|mtime always, atime if new
            *
            * Retry behaviour:
            *      When dmu_tx_assign() returns ERESTART the directory entry lock
            *      is dropped, the function waits on the transaction, and the
            *      whole lookup/create sequence is restarted from the "top" label.
1373 1376   */
1374 1377  
1375 1378  /* ARGSUSED */
1376 1379  static int
1377 1380  zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1378 1381      int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1379 1382      vsecattr_t *vsecp)
1380 1383  {
1381 1384          znode_t         *zp, *dzp = VTOZ(dvp);
1382 1385          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1383 1386          zilog_t         *zilog;
1384 1387          objset_t        *os;
1385 1388          zfs_dirlock_t   *dl;
1386 1389          dmu_tx_t        *tx;
1387 1390          int             error;
1388 1391          ksid_t          *ksid;
1389 1392          uid_t           uid;
1390 1393          gid_t           gid = crgetgid(cr);
1391 1394          zfs_acl_ids_t   acl_ids;
1392 1395          boolean_t       fuid_dirtied;
                   /* B_TRUE while acl_ids is populated and still has to be freed. */
1393 1396          boolean_t       have_acl = B_FALSE;
                   /* Set after one ERESTART so the retry assigns with TXG_WAITED. */
1394 1397          boolean_t       waited = B_FALSE;
1395 1398  
1396 1399          /*
1397 1400           * If we have an ephemeral id, ACL, or XVATTR then
1398 1401           * make sure file system is at proper version
1399 1402           */
1400 1403  
1401 1404          ksid = crgetsid(cr, KSID_OWNER);
1402 1405          if (ksid)
1403 1406                  uid = ksid_getid(ksid);
1404 1407          else
1405 1408                  uid = crgetuid(cr);
1406 1409  
1407 1410          if (zfsvfs->z_use_fuids == B_FALSE &&
1408 1411              (vsecp || (vap->va_mask & AT_XVATTR) ||
1409 1412              IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1410 1413                  return (SET_ERROR(EINVAL));
1411 1414  
1412 1415          ZFS_ENTER(zfsvfs);
1413 1416          ZFS_VERIFY_ZP(dzp);
1414 1417          os = zfsvfs->z_os;
1415 1418          zilog = zfsvfs->z_log;
1416 1419  
                   /* On a utf8-only file system reject names that fail validation. */
1417 1420          if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1418 1421              NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1419 1422                  ZFS_EXIT(zfsvfs);
1420 1423                  return (SET_ERROR(EILSEQ));
1421 1424          }
1422 1425  
1423 1426          if (vap->va_mask & AT_XVATTR) {
1424 1427                  if ((error = secpolicy_xvattr((xvattr_t *)vap,
1425 1428                      crgetuid(cr), cr, vap->va_type)) != 0) {
1426 1429                          ZFS_EXIT(zfsvfs);
1427 1430                          return (error);
1428 1431                  }
1429 1432          }
           /*
            * Retry point.  Control returns here when dmu_tx_assign() below
            * reports ERESTART.  acl_ids built on an earlier pass is kept
            * (tracked by have_acl) rather than rebuilt.
            */
1430 1433  top:
1431 1434          *vpp = NULL;
1432 1435  
                   /* Strip the sticky bit when the policy check returns non-zero. */
1433 1436          if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1434 1437                  vap->va_mode &= ~VSVTX;
1435 1438  
1436 1439          if (*name == '\0') {
1437 1440                  /*
1438 1441                   * Null component name refers to the directory itself.
1439 1442                   */
1440 1443                  VN_HOLD(dvp);
1441 1444                  zp = dzp;
1442 1445                  dl = NULL;
1443 1446                  error = 0;
1444 1447          } else {
1445 1448                  /* possible VN_HOLD(zp) */
1446 1449                  int zflg = 0;
1447 1450  
1448 1451                  if (flag & FIGNORECASE)
1449 1452                          zflg |= ZCILOOK;
1450 1453  
1451 1454                  error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1452 1455                      NULL, NULL);
1453 1456                  if (error) {
1454 1457                          if (have_acl)
1455 1458                                  zfs_acl_ids_free(&acl_ids);
                                   /* Any failure to lock ".." is reported as EISDIR. */
1456 1459                          if (strcmp(name, "..") == 0)
1457 1460                                  error = SET_ERROR(EISDIR);
1458 1461                          ZFS_EXIT(zfsvfs);
1459 1462                          return (error);
1460 1463                  }
1461 1464          }
1462 1465  
                   /* zp == NULL: no existing entry was found, so create one. */
1463 1466          if (zp == NULL) {
1464 1467                  uint64_t txtype;
1465 1468  
1466 1469                  /*
1467 1470                   * Create a new file object and update the directory
1468 1471                   * to reference it.
1469 1472                   */
1470 1473                  if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1471 1474                          if (have_acl)
1472 1475                                  zfs_acl_ids_free(&acl_ids);
1473 1476                          goto out;
1474 1477                  }
1475 1478  
1476 1479                  /*
1477 1480                   * We only support the creation of regular files in
1478 1481                   * extended attribute directories.
1479 1482                   */
1480 1483  
1481 1484                  if ((dzp->z_pflags & ZFS_XATTR) &&
1482 1485                      (vap->va_type != VREG)) {
1483 1486                          if (have_acl)
1484 1487                                  zfs_acl_ids_free(&acl_ids);
1485 1488                          error = SET_ERROR(EINVAL);
1486 1489                          goto out;
1487 1490                  }
1488 1491  
                           /* Build acl_ids only on the first pass through here. */
1489 1492                  if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1490 1493                      cr, vsecp, &acl_ids)) != 0)
1491 1494                          goto out;
1492 1495                  have_acl = B_TRUE;
1493 1496  
1494 1497                  if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1495 1498                          zfs_acl_ids_free(&acl_ids);
1496 1499                          error = SET_ERROR(EDQUOT);
1497 1500                          goto out;
1498 1501                  }
1499 1502  
                           /* Declare every hold the create needs before assigning. */
1500 1503                  tx = dmu_tx_create(os);
1501 1504  
1502 1505                  dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1503 1506                      ZFS_SA_BASE_ATTR_SIZE);
1504 1507  
1505 1508                  fuid_dirtied = zfsvfs->z_fuid_dirty;
1506 1509                  if (fuid_dirtied)
1507 1510                          zfs_fuid_txhold(zfsvfs, tx);
1508 1511                  dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1509 1512                  dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1510 1513                  if (!zfsvfs->z_use_sa &&
1511 1514                      acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1512 1515                          dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1513 1516                              0, acl_ids.z_aclp->z_acl_bytes);
1514 1517                  }
1515 1518                  error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1516 1519                  if (error) {
                                   /* Drop the dirent lock before waiting or bailing out. */
1517 1520                          zfs_dirent_unlock(dl);
1518 1521                          if (error == ERESTART) {
1519 1522                                  waited = B_TRUE;
1520 1523                                  dmu_tx_wait(tx);
1521 1524                                  dmu_tx_abort(tx);
1522 1525                                  goto top;
1523 1526                          }
1524 1527                          zfs_acl_ids_free(&acl_ids);
1525 1528                          dmu_tx_abort(tx);
1526 1529                          ZFS_EXIT(zfsvfs);
1527 1530                          return (error);
1528 1531                  }
1529 1532                  zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1530 1533  
1531 1534                  if (fuid_dirtied)
1532 1535                          zfs_fuid_sync(zfsvfs, tx);
1533 1536  
                           /* The return value of zfs_link_create() is discarded. */
1534 1537                  (void) zfs_link_create(dl, zp, tx, ZNEW);
1535 1538                  txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1536 1539                  if (flag & FIGNORECASE)
1537 1540                          txtype |= TX_CI;
1538 1541                  zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1539 1542                      vsecp, acl_ids.z_fuidp, vap);
1540 1543                  zfs_acl_ids_free(&acl_ids);
1541 1544                  dmu_tx_commit(tx);
1542 1545          } else {
1543 1546                  int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1544 1547  
                           /* acl_ids from an earlier pass is not needed on this path. */
1545 1548                  if (have_acl)
1546 1549                          zfs_acl_ids_free(&acl_ids);
1547 1550                  have_acl = B_FALSE;
1548 1551  
1549 1552                  /*
1550 1553                   * A directory entry already exists for this name.
1551 1554                   */
1552 1555                  /*
1553 1556                   * Can't truncate an existing file if in exclusive mode.
1554 1557                   */
1555 1558                  if (excl == EXCL) {
1556 1559                          error = SET_ERROR(EEXIST);
1557 1560                          goto out;
1558 1561                  }
1559 1562                  /*
1560 1563                   * Can't open a directory for writing.
1561 1564                   */
1562 1565                  if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1563 1566                          error = SET_ERROR(EISDIR);
1564 1567                          goto out;
1565 1568                  }
1566 1569                  /*
1567 1570                   * Verify requested access to file.
1568 1571                   */
1569 1572                  if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1570 1573                          goto out;
1571 1574                  }
1572 1575  
1573 1576                  mutex_enter(&dzp->z_lock);
1574 1577                  dzp->z_seq++;
1575 1578                  mutex_exit(&dzp->z_lock);
1576 1579  
1577 1580                  /*
1578 1581                   * Truncate regular files if requested.
1579 1582                   */
1580 1583                  if ((ZTOV(zp)->v_type == VREG) &&
1581 1584                      (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1582 1585                          /* we can't hold any locks when calling zfs_freesp() */
1583 1586                          zfs_dirent_unlock(dl);
1584 1587                          dl = NULL;
1585 1588                          error = zfs_freesp(zp, 0, 0, mode, TRUE);
1586 1589                          if (error == 0) {
1587 1590                                  vnevent_create(ZTOV(zp), ct);
1588 1591                          }
1589 1592                  }
1590 1593          }
1591 1594  out:
1592 1595  
                   /* dl is NULL for the empty-name case and after the truncate path. */
1593 1596          if (dl)
1594 1597                  zfs_dirent_unlock(dl);
1595 1598  
                   /* On failure drop the hold on zp; on success hand the vnode back. */
1596 1599          if (error) {
1597 1600                  if (zp)
1598 1601                          VN_RELE(ZTOV(zp));
1599 1602          } else {
1600 1603                  *vpp = ZTOV(zp);
1601 1604                  error = specvp_check(vpp, cr);
1602 1605          }
1603 1606  
                   /* Commit the intent log when the objset is ZFS_SYNC_ALWAYS. */
1604 1607          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1605 1608                  zil_commit(zilog, 0);
1606 1609  
1607 1610          ZFS_EXIT(zfsvfs);
1608 1611          return (error);
1609 1612  }
1610 1613  
1611 1614  /*
1612 1615   * Remove an entry from a directory.
1613 1616   *
1614 1617   *      IN:     dvp     - vnode of directory to remove entry from.
1615 1618   *              name    - name of entry to remove.
1616 1619   *              cr      - credentials of caller.
1617 1620   *              ct      - caller context
1618 1621   *              flags   - case flags
1619 1622   *
1620 1623   *      RETURN: 0 on success, error code on failure.
1621 1624   *
1622 1625   * Timestamps:
1623 1626   *      dvp - ctime|mtime
1624 1627   *       vp - ctime (if nlink > 0)
            *
            * Directories are refused with EPERM; they must be removed with rmdir.
            * When dmu_tx_assign() returns ERESTART, the holds taken by this
            * attempt are released and the operation restarts at "top".
1625 1628   */
1626 1629  
           /*
            * Zero object number stored into SA_ZPL_XATTR by zfs_remove() for
            * non-SA znodes when the xattr directory is unlinked with the file.
            */
1627 1630  uint64_t null_xattr = 0;
1628 1631  
1629 1632  /*ARGSUSED*/
1630 1633  static int
1631 1634  zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1632 1635      int flags)
1633 1636  {
1634 1637          znode_t         *zp, *dzp = VTOZ(dvp);
1635 1638          znode_t         *xzp;
1636 1639          vnode_t         *vp;
1637 1640          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1638 1641          zilog_t         *zilog;
1639 1642          uint64_t        acl_obj, xattr_obj;
1640 1643          uint64_t        xattr_obj_unlinked = 0;
1641 1644          uint64_t        obj = 0;
1642 1645          zfs_dirlock_t   *dl;
1643 1646          dmu_tx_t        *tx;
1644 1647          boolean_t       may_delete_now, delete_now = FALSE;
1645 1648          boolean_t       unlinked, toobig = FALSE;
1646 1649          uint64_t        txtype;
1647 1650          pathname_t      *realnmp = NULL;
1648 1651          pathname_t      realnm;
1649 1652          int             error;
1650 1653          int             zflg = ZEXISTS;
                   /* Set after one ERESTART so the retry assigns with TXG_WAITED. */
1651 1654          boolean_t       waited = B_FALSE;
1652 1655  
1653 1656          ZFS_ENTER(zfsvfs);
1654 1657          ZFS_VERIFY_ZP(dzp);
1655 1658          zilog = zfsvfs->z_log;
1656 1659  
                   /*
                    * For a case-insensitive request, allocate a pathname buffer that
                    * is handed to zfs_dirent_lock(); its pn_buf is the name later
                    * passed to dnlc_remove().
                    */
1657 1660          if (flags & FIGNORECASE) {
1658 1661                  zflg |= ZCILOOK;
1659 1662                  pn_alloc(&realnm);
1660 1663                  realnmp = &realnm;
1661 1664          }
1662 1665  
           /* Retry point for ERESTART; per-attempt state is reset here. */
1663 1666  top:
1664 1667          xattr_obj = 0;
1665 1668          xzp = NULL;
1666 1669          /*
1667 1670           * Attempt to lock directory; fail if entry doesn't exist.
1668 1671           */
1669 1672          if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1670 1673              NULL, realnmp)) {
1671 1674                  if (realnmp)
1672 1675                          pn_free(realnmp);
1673 1676                  ZFS_EXIT(zfsvfs);
1674 1677                  return (error);
1675 1678          }
1676 1679  
1677 1680          vp = ZTOV(zp);
1678 1681  
1679 1682          if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1680 1683                  goto out;
1681 1684          }
1682 1685  
1683 1686          /*
1684 1687           * Need to use rmdir for removing directories.
1685 1688           */
1686 1689          if (vp->v_type == VDIR) {
1687 1690                  error = SET_ERROR(EPERM);
1688 1691                  goto out;
1689 1692          }
1690 1693  
1691 1694          vnevent_remove(vp, dvp, name, ct);
1692 1695  
1693 1696          if (realnmp)
1694 1697                  dnlc_remove(dvp, realnmp->pn_buf);
1695 1698          else
1696 1699                  dnlc_remove(dvp, name);
1697 1700  
                   /*
                    * Sample, under v_lock, whether v_count is 1 and the vnode has no
                    * cached data.  The same conditions are rechecked after the link
                    * has been destroyed, before delete_now is set.
                    */
1698 1701          mutex_enter(&vp->v_lock);
1699 1702          may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1700 1703          mutex_exit(&vp->v_lock);
1701 1704  
1702 1705          /*
1703 1706           * We may delete the znode now, or we may put it in the unlinked set;
1704 1707           * it depends on whether we're the last link, and on whether there are
1705 1708           * other holds on the vnode.  So we dmu_tx_hold() the right things to
1706 1709           * allow for either case.
1707 1710           */
1708 1711          obj = zp->z_id;
1709 1712          tx = dmu_tx_create(zfsvfs->z_os);
1710 1713          dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1711 1714          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1712 1715          zfs_sa_upgrade_txholds(tx, zp);
1713 1716          zfs_sa_upgrade_txholds(tx, dzp);
1714 1717          if (may_delete_now) {
1715 1718                  toobig =
1716 1719                      zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1717 1720                  /* if the file is too big, only hold_free a token amount */
1718 1721                  dmu_tx_hold_free(tx, zp->z_id, 0,
1719 1722                      (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1720 1723          }
1721 1724  
1722 1725          /* are there any extended attributes? */
1723 1726          error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1724 1727              &xattr_obj, sizeof (xattr_obj));
1725 1728          if (error == 0 && xattr_obj) {
                           /*
                            * NOTE(review): a zfs_zget() failure is only caught by
                            * ASSERT0; xzp is dereferenced right after -- confirm this
                            * cannot fail on non-DEBUG kernels.
                            */
1726 1729                  error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1727 1730                  ASSERT0(error);
1728 1731                  dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1729 1732                  dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1730 1733          }
1731 1734  
1732 1735          mutex_enter(&zp->z_lock);
1733 1736          if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1734 1737                  dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1735 1738          mutex_exit(&zp->z_lock);
1736 1739  
1737 1740          /* charge as an update -- would be nice not to charge at all */
1738 1741          dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1739 1742  
1740 1743          /*
1741 1744           * Mark this transaction as typically resulting in a net free of space
1742 1745           */
1743 1746          dmu_tx_mark_netfree(tx);
1744 1747  
1745 1748          error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1746 1749          if (error) {
                           /* Release this attempt's lock and holds before retry/exit. */
1747 1750                  zfs_dirent_unlock(dl);
1748 1751                  VN_RELE(vp);
1749 1752                  if (xzp)
1750 1753                          VN_RELE(ZTOV(xzp));
1751 1754                  if (error == ERESTART) {
1752 1755                          waited = B_TRUE;
1753 1756                          dmu_tx_wait(tx);
1754 1757                          dmu_tx_abort(tx);
1755 1758                          goto top;
1756 1759                  }
1757 1760                  if (realnmp)
1758 1761                          pn_free(realnmp);
1759 1762                  dmu_tx_abort(tx);
1760 1763                  ZFS_EXIT(zfsvfs);
1761 1764                  return (error);
1762 1765          }
1763 1766  
1764 1767          /*
1765 1768           * Remove the directory entry.
1766 1769           */
1767 1770          error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1768 1771  
                   /* The assigned tx is committed even when the destroy failed. */
1769 1772          if (error) {
1770 1773                  dmu_tx_commit(tx);
1771 1774                  goto out;
1772 1775          }
1773 1776  
1774 1777          if (unlinked) {
1775 1778                  /*
1776 1779                   * Hold z_lock so that we can make sure that the ACL obj
1777 1780                   * hasn't changed.  Could have been deleted due to
1778 1781                   * zfs_sa_upgrade().
1779 1782                   */
1780 1783                  mutex_enter(&zp->z_lock);
1781 1784                  mutex_enter(&vp->v_lock);
1782 1785                  (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1783 1786                      &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1784 1787                  delete_now = may_delete_now && !toobig &&
1785 1788                      vp->v_count == 1 && !vn_has_cached_data(vp) &&
1786 1789                      xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1787 1790                      acl_obj;
1788 1791                  mutex_exit(&vp->v_lock);
                           /*
                            * z_lock stays held here; it is released in the delete_now
                            * branch or the "else if (unlinked)" branch below.
                            */
1789 1792          }
1790 1793  
1791 1794          if (delete_now) {
1792 1795                  if (xattr_obj_unlinked) {
                                   /* Unlink the xattr directory and queue it as unlinked. */
1793 1796                          ASSERT3U(xzp->z_links, ==, 2);
1794 1797                          mutex_enter(&xzp->z_lock);
1795 1798                          xzp->z_unlinked = 1;
1796 1799                          xzp->z_links = 0;
1797 1800                          error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1798 1801                              &xzp->z_links, sizeof (xzp->z_links), tx);
1799 1802                          ASSERT3U(error,  ==,  0);
1800 1803                          mutex_exit(&xzp->z_lock);
1801 1804                          zfs_unlinked_add(xzp, tx);
1802 1805  
                                   /* Clear the xattr reference on the file's znode. */
1803 1806                          if (zp->z_is_sa)
1804 1807                                  error = sa_remove(zp->z_sa_hdl,
1805 1808                                      SA_ZPL_XATTR(zfsvfs), tx);
1806 1809                          else
1807 1810                                  error = sa_update(zp->z_sa_hdl,
1808 1811                                      SA_ZPL_XATTR(zfsvfs), &null_xattr,
1809 1812                                      sizeof (uint64_t), tx);
1810 1813                          ASSERT0(error);
1811 1814                  }
                           /*
                            * Drop our hold by decrementing v_count directly; the
                            * assertion checks that the count is now zero.
                            */
1812 1815                  mutex_enter(&vp->v_lock);
1813 1816                  vp->v_count--;
1814 1817                  ASSERT0(vp->v_count);
1815 1818                  mutex_exit(&vp->v_lock);
1816 1819                  mutex_exit(&zp->z_lock);
1817 1820                  zfs_znode_delete(zp, tx);
1818 1821          } else if (unlinked) {
1819 1822                  mutex_exit(&zp->z_lock);
1820 1823                  zfs_unlinked_add(zp, tx);
1821 1824          }
1822 1825  
1823 1826          txtype = TX_REMOVE;
1824 1827          if (flags & FIGNORECASE)
1825 1828                  txtype |= TX_CI;
1826 1829          zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1827 1830  
1828 1831          dmu_tx_commit(tx);
1829 1832  out:
1830 1833          if (realnmp)
1831 1834                  pn_free(realnmp);
1832 1835  
1833 1836          zfs_dirent_unlock(dl);
1834 1837  
                   /* With delete_now the vnode hold was already dropped above. */
1835 1838          if (!delete_now)
1836 1839                  VN_RELE(vp);
1837 1840          if (xzp)
1838 1841                  VN_RELE(ZTOV(xzp));
1839 1842  
                   /* Commit the intent log when the objset is ZFS_SYNC_ALWAYS. */
1840 1843          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1841 1844                  zil_commit(zilog, 0);
1842 1845  
1843 1846          ZFS_EXIT(zfsvfs);
1844 1847          return (error);
1845 1848  }
1846 1849  
1847 1850  /*
1848 1851   * Create a new directory and insert it into dvp using the name
1849 1852   * provided.  Return a pointer to the inserted directory.
1850 1853   *
1851 1854   *      IN:     dvp     - vnode of directory to add subdir to.
1852 1855   *              dirname - name of new directory.
1853 1856   *              vap     - attributes of new directory.
1854 1857   *              cr      - credentials of caller.
1855 1858   *              ct      - caller context
1856 1859   *              flags   - case flags
1857 1860   *              vsecp   - ACL to be set
1858 1861   *
1859 1862   *      OUT:    vpp     - vnode of created directory.
1860 1863   *
1861 1864   *      RETURN: 0 on success, error code on failure.
1862 1865   *
1863 1866   * Timestamps:
1864 1867   *      dvp - ctime|mtime updated
1865 1868   *       vp - ctime|mtime|atime updated
            *
            * Creation inside an extended attribute directory (ZFS_XATTR set on
            * the parent) is rejected with EINVAL.  When dmu_tx_assign() returns
            * ERESTART the dirent lock is dropped and the operation restarts at
            * "top", reusing the acl_ids built before that label.
1866 1869   */
1867 1870  /*ARGSUSED*/
1868 1871  static int
1869 1872  zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1870 1873      caller_context_t *ct, int flags, vsecattr_t *vsecp)
1871 1874  {
1872 1875          znode_t         *zp, *dzp = VTOZ(dvp);
1873 1876          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
1874 1877          zilog_t         *zilog;
1875 1878          zfs_dirlock_t   *dl;
1876 1879          uint64_t        txtype;
1877 1880          dmu_tx_t        *tx;
1878 1881          int             error;
1879 1882          int             zf = ZNEW;
1880 1883          ksid_t          *ksid;
1881 1884          uid_t           uid;
1882 1885          gid_t           gid = crgetgid(cr);
1883 1886          zfs_acl_ids_t   acl_ids;
1884 1887          boolean_t       fuid_dirtied;
                   /* Set after one ERESTART so the retry assigns with TXG_WAITED. */
1885 1888          boolean_t       waited = B_FALSE;
1886 1889  
1887 1890          ASSERT(vap->va_type == VDIR);
1888 1891  
1889 1892          /*
1890 1893           * If we have an ephemeral id, ACL, or XVATTR then
1891 1894           * make sure file system is at proper version
1892 1895           */
1893 1896  
1894 1897          ksid = crgetsid(cr, KSID_OWNER);
1895 1898          if (ksid)
1896 1899                  uid = ksid_getid(ksid);
1897 1900          else
1898 1901                  uid = crgetuid(cr);
1899 1902          if (zfsvfs->z_use_fuids == B_FALSE &&
1900 1903              (vsecp || (vap->va_mask & AT_XVATTR) ||
1901 1904              IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1902 1905                  return (SET_ERROR(EINVAL));
1903 1906  
1904 1907          ZFS_ENTER(zfsvfs);
1905 1908          ZFS_VERIFY_ZP(dzp);
1906 1909          zilog = zfsvfs->z_log;
1907 1910  
                   /* No subdirectories inside an extended attribute directory. */
1908 1911          if (dzp->z_pflags & ZFS_XATTR) {
1909 1912                  ZFS_EXIT(zfsvfs);
1910 1913                  return (SET_ERROR(EINVAL));
1911 1914          }
1912 1915  
                   /* On a utf8-only file system reject names that fail validation. */
1913 1916          if (zfsvfs->z_utf8 && u8_validate(dirname,
1914 1917              strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1915 1918                  ZFS_EXIT(zfsvfs);
1916 1919                  return (SET_ERROR(EILSEQ));
1917 1920          }
1918 1921          if (flags & FIGNORECASE)
1919 1922                  zf |= ZCILOOK;
1920 1923  
1921 1924          if (vap->va_mask & AT_XVATTR) {
1922 1925                  if ((error = secpolicy_xvattr((xvattr_t *)vap,
1923 1926                      crgetuid(cr), cr, vap->va_type)) != 0) {
1924 1927                          ZFS_EXIT(zfsvfs);
1925 1928                          return (error);
1926 1929                  }
1927 1930          }
1928 1931  
                   /*
                    * acl_ids is built once, ahead of the retry label, and is freed
                    * on every exit path below.
                    */
1929 1932          if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1930 1933              vsecp, &acl_ids)) != 0) {
1931 1934                  ZFS_EXIT(zfsvfs);
1932 1935                  return (error);
1933 1936          }
1934 1937          /*
1935 1938           * First make sure the new directory doesn't exist.
1936 1939           *
1937 1940           * Existence is checked first to make sure we don't return
1938 1941           * EACCES instead of EEXIST which can cause some applications
1939 1942           * to fail.
                    *
                    * NOTE(review): presumably the ZNEW bit in zf is what makes
                    * zfs_dirent_lock() fail for an existing name -- confirm against
                    * zfs_dir.c.
1940 1943           */
1941 1944  top:
1942 1945          *vpp = NULL;
1943 1946  
1944 1947          if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1945 1948              NULL, NULL)) {
1946 1949                  zfs_acl_ids_free(&acl_ids);
1947 1950                  ZFS_EXIT(zfsvfs);
1948 1951                  return (error);
1949 1952          }
1950 1953  
1951 1954          if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1952 1955                  zfs_acl_ids_free(&acl_ids);
1953 1956                  zfs_dirent_unlock(dl);
1954 1957                  ZFS_EXIT(zfsvfs);
1955 1958                  return (error);
1956 1959          }
1957 1960  
1958 1961          if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1959 1962                  zfs_acl_ids_free(&acl_ids);
1960 1963                  zfs_dirent_unlock(dl);
1961 1964                  ZFS_EXIT(zfsvfs);
1962 1965                  return (SET_ERROR(EDQUOT));
1963 1966          }
1964 1967  
1965 1968          /*
1966 1969           * Add a new entry to the directory.
1967 1970           */
1968 1971          tx = dmu_tx_create(zfsvfs->z_os);
1969 1972          dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1970 1973          dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1971 1974          fuid_dirtied = zfsvfs->z_fuid_dirty;
1972 1975          if (fuid_dirtied)
1973 1976                  zfs_fuid_txhold(zfsvfs, tx);
1974 1977          if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1975 1978                  dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1976 1979                      acl_ids.z_aclp->z_acl_bytes);
1977 1980          }
1978 1981  
1979 1982          dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1980 1983              ZFS_SA_BASE_ATTR_SIZE);
1981 1984  
1982 1985          error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
1983 1986          if (error) {
                           /* Drop the dirent lock before waiting or bailing out. */
1984 1987                  zfs_dirent_unlock(dl);
1985 1988                  if (error == ERESTART) {
                                   /* acl_ids is kept for the next pass. */
1986 1989                          waited = B_TRUE;
1987 1990                          dmu_tx_wait(tx);
1988 1991                          dmu_tx_abort(tx);
1989 1992                          goto top;
1990 1993                  }
1991 1994                  zfs_acl_ids_free(&acl_ids);
1992 1995                  dmu_tx_abort(tx);
1993 1996                  ZFS_EXIT(zfsvfs);
1994 1997                  return (error);
1995 1998          }
1996 1999  
1997 2000          /*
1998 2001           * Create new node.
1999 2002           */
2000 2003          zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2001 2004  
2002 2005          if (fuid_dirtied)
2003 2006                  zfs_fuid_sync(zfsvfs, tx);
2004 2007  
2005 2008          /*
2006 2009           * Now put new name in parent dir.
                    * The return value of zfs_link_create() is discarded.
2007 2010           */
2008 2011          (void) zfs_link_create(dl, zp, tx, ZNEW);
2009 2012  
2010 2013          *vpp = ZTOV(zp);
2011 2014  
2012 2015          txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2013 2016          if (flags & FIGNORECASE)
2014 2017                  txtype |= TX_CI;
2015 2018          zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2016 2019              acl_ids.z_fuidp, vap);
2017 2020  
2018 2021          zfs_acl_ids_free(&acl_ids);
2019 2022  
2020 2023          dmu_tx_commit(tx);
2021 2024  
2022 2025          zfs_dirent_unlock(dl);
2023 2026  
                   /* Commit the intent log when the objset is ZFS_SYNC_ALWAYS. */
2024 2027          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2025 2028                  zil_commit(zilog, 0);
2026 2029  
2027 2030          ZFS_EXIT(zfsvfs);
2028 2031          return (0);
2029 2032  }
2030 2033  
2031 2034  /*
2032 2035   * Remove a directory subdir entry.  If the current working
2033 2036   * directory is the same as the subdir to be removed, the
2034 2037   * remove will fail.
2035 2038   *
2036 2039   *      IN:     dvp     - vnode of directory to remove from.
2037 2040   *              name    - name of directory to be removed.
2038 2041   *              cwd     - vnode of current working directory.
2039 2042   *              cr      - credentials of caller.
2040 2043   *              ct      - caller context
2041 2044   *              flags   - case flags
2042 2045   *
2043 2046   *      RETURN: 0 on success, error code on failure.
2044 2047   *
2045 2048   * Timestamps:
2046 2049   *      dvp - ctime|mtime updated
2047 2050   */
2048 2051  /*ARGSUSED*/
2049 2052  static int
2050 2053  zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2051 2054      caller_context_t *ct, int flags)
2052 2055  {
2053 2056          znode_t         *dzp = VTOZ(dvp);
2054 2057          znode_t         *zp;
2055 2058          vnode_t         *vp;
2056 2059          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
2057 2060          zilog_t         *zilog;
2058 2061          zfs_dirlock_t   *dl;
2059 2062          dmu_tx_t        *tx;
2060 2063          int             error;
2061 2064          int             zflg = ZEXISTS;
2062 2065          boolean_t       waited = B_FALSE;
2063 2066  
2064 2067          ZFS_ENTER(zfsvfs);
2065 2068          ZFS_VERIFY_ZP(dzp);
2066 2069          zilog = zfsvfs->z_log;
2067 2070  
2068 2071          if (flags & FIGNORECASE)
2069 2072                  zflg |= ZCILOOK;
2070 2073  top:
2071 2074          zp = NULL;
2072 2075  
2073 2076          /*
2074 2077           * Attempt to lock directory; fail if entry doesn't exist.
2075 2078           */
2076 2079          if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2077 2080              NULL, NULL)) {
2078 2081                  ZFS_EXIT(zfsvfs);
2079 2082                  return (error);
2080 2083          }
2081 2084  
2082 2085          vp = ZTOV(zp);
2083 2086  
2084 2087          if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2085 2088                  goto out;
2086 2089          }
2087 2090  
2088 2091          if (vp->v_type != VDIR) {
2089 2092                  error = SET_ERROR(ENOTDIR);
2090 2093                  goto out;
2091 2094          }
2092 2095  
2093 2096          if (vp == cwd) {
2094 2097                  error = SET_ERROR(EINVAL);
2095 2098                  goto out;
2096 2099          }
2097 2100  
2098 2101          vnevent_rmdir(vp, dvp, name, ct);
2099 2102  
2100 2103          /*
2101 2104           * Grab a lock on the directory to make sure that noone is
2102 2105           * trying to add (or lookup) entries while we are removing it.
2103 2106           */
2104 2107          rw_enter(&zp->z_name_lock, RW_WRITER);
2105 2108  
2106 2109          /*
2107 2110           * Grab a lock on the parent pointer to make sure we play well
2108 2111           * with the treewalk and directory rename code.
2109 2112           */
2110 2113          rw_enter(&zp->z_parent_lock, RW_WRITER);
2111 2114  
2112 2115          tx = dmu_tx_create(zfsvfs->z_os);
2113 2116          dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2114 2117          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2115 2118          dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2116 2119          zfs_sa_upgrade_txholds(tx, zp);
2117 2120          zfs_sa_upgrade_txholds(tx, dzp);
2118 2121          dmu_tx_mark_netfree(tx);
2119 2122          error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
2120 2123          if (error) {
2121 2124                  rw_exit(&zp->z_parent_lock);
2122 2125                  rw_exit(&zp->z_name_lock);
2123 2126                  zfs_dirent_unlock(dl);
2124 2127                  VN_RELE(vp);
2125 2128                  if (error == ERESTART) {
2126 2129                          waited = B_TRUE;
2127 2130                          dmu_tx_wait(tx);
2128 2131                          dmu_tx_abort(tx);
2129 2132                          goto top;
2130 2133                  }
2131 2134                  dmu_tx_abort(tx);
2132 2135                  ZFS_EXIT(zfsvfs);
2133 2136                  return (error);
2134 2137          }
2135 2138  
2136 2139          error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2137 2140  
2138 2141          if (error == 0) {
2139 2142                  uint64_t txtype = TX_RMDIR;
2140 2143                  if (flags & FIGNORECASE)
2141 2144                          txtype |= TX_CI;
2142 2145                  zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2143 2146          }
2144 2147  
2145 2148          dmu_tx_commit(tx);
2146 2149  
2147 2150          rw_exit(&zp->z_parent_lock);
2148 2151          rw_exit(&zp->z_name_lock);
2149 2152  out:
2150 2153          zfs_dirent_unlock(dl);
2151 2154  
2152 2155          VN_RELE(vp);
2153 2156  
2154 2157          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2155 2158                  zil_commit(zilog, 0);
2156 2159  
2157 2160          ZFS_EXIT(zfsvfs);
2158 2161          return (error);
2159 2162  }
2160 2163  
2161 2164  /*
2162 2165   * Read as many directory entries as will fit into the provided
2163 2166   * buffer from the given directory cursor position (specified in
2164 2167   * the uio structure).
2165 2168   *
2166 2169   *      IN:     vp      - vnode of directory to read.
2167 2170   *              uio     - structure supplying read location, range info,
2168 2171   *                        and return buffer.
2169 2172   *              cr      - credentials of caller.
2170 2173   *              ct      - caller context
2171 2174   *              flags   - case flags
2172 2175   *
2173 2176   *      OUT:    uio     - updated offset and range, buffer filled.
2174 2177   *              eofp    - set to true if end-of-file detected.
2175 2178   *
2176 2179   *      RETURN: 0 on success, error code on failure.
2177 2180   *
2178 2181   * Timestamps:
2179 2182   *      vp - atime updated
2180 2183   *
2181 2184   * Note that the low 4 bits of the cookie returned by zap is always zero.
2182 2185   * This allows us to use the low range for "special" directory entries:
2183 2186   * We use 0 for '.', and 1 for '..'.  If this is the root of the filesystem,
2184 2187   * we use the offset 2 for the '.zfs' directory.
2185 2188   */
2186 2189  /* ARGSUSED */
2187 2190  static int
2188 2191  zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2189 2192      caller_context_t *ct, int flags)
2190 2193  {
2191 2194          znode_t         *zp = VTOZ(vp);
2192 2195          iovec_t         *iovp;
2193 2196          edirent_t       *eodp;
2194 2197          dirent64_t      *odp;
2195 2198          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2196 2199          objset_t        *os;
2197 2200          caddr_t         outbuf;
2198 2201          size_t          bufsize;
2199 2202          zap_cursor_t    zc;
2200 2203          zap_attribute_t zap;
2201 2204          uint_t          bytes_wanted;
2202 2205          uint64_t        offset; /* must be unsigned; checks for < 1 */
2203 2206          uint64_t        parent;
2204 2207          int             local_eof;
2205 2208          int             outcount;
2206 2209          int             error;
2207 2210          uint8_t         prefetch;
2208 2211          boolean_t       check_sysattrs;
2209 2212  
2210 2213          ZFS_ENTER(zfsvfs);
2211 2214          ZFS_VERIFY_ZP(zp);
2212 2215  
2213 2216          if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2214 2217              &parent, sizeof (parent))) != 0) {
2215 2218                  ZFS_EXIT(zfsvfs);
2216 2219                  return (error);
2217 2220          }
2218 2221  
2219 2222          /*
2220 2223           * If we are not given an eof variable,
2221 2224           * use a local one.
2222 2225           */
2223 2226          if (eofp == NULL)
2224 2227                  eofp = &local_eof;
2225 2228  
2226 2229          /*
2227 2230           * Check for valid iov_len.
2228 2231           */
2229 2232          if (uio->uio_iov->iov_len <= 0) {
2230 2233                  ZFS_EXIT(zfsvfs);
2231 2234                  return (SET_ERROR(EINVAL));
2232 2235          }
2233 2236  
2234 2237          /*
2235 2238           * Quit if directory has been removed (posix)
2236 2239           */
2237 2240          if ((*eofp = zp->z_unlinked) != 0) {
2238 2241                  ZFS_EXIT(zfsvfs);
2239 2242                  return (0);
2240 2243          }
2241 2244  
2242 2245          error = 0;
2243 2246          os = zfsvfs->z_os;
2244 2247          offset = uio->uio_loffset;
2245 2248          prefetch = zp->z_zn_prefetch;
2246 2249  
2247 2250          /*
2248 2251           * Initialize the iterator cursor.
2249 2252           */
2250 2253          if (offset <= 3) {
2251 2254                  /*
2252 2255                   * Start iteration from the beginning of the directory.
2253 2256                   */
2254 2257                  zap_cursor_init(&zc, os, zp->z_id);
2255 2258          } else {
2256 2259                  /*
2257 2260                   * The offset is a serialized cursor.
2258 2261                   */
2259 2262                  zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2260 2263          }
2261 2264  
2262 2265          /*
2263 2266           * Get space to change directory entries into fs independent format.
2264 2267           */
2265 2268          iovp = uio->uio_iov;
2266 2269          bytes_wanted = iovp->iov_len;
2267 2270          if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2268 2271                  bufsize = bytes_wanted;
2269 2272                  outbuf = kmem_alloc(bufsize, KM_SLEEP);
2270 2273                  odp = (struct dirent64 *)outbuf;
2271 2274          } else {
2272 2275                  bufsize = bytes_wanted;
2273 2276                  outbuf = NULL;
2274 2277                  odp = (struct dirent64 *)iovp->iov_base;
2275 2278          }
2276 2279          eodp = (struct edirent *)odp;
2277 2280  
2278 2281          /*
2279 2282           * If this VFS supports the system attribute view interface; and
2280 2283           * we're looking at an extended attribute directory; and we care
2281 2284           * about normalization conflicts on this vfs; then we must check
2282 2285           * for normalization conflicts with the sysattr name space.
2283 2286           */
2284 2287          check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2285 2288              (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2286 2289              (flags & V_RDDIR_ENTFLAGS);
2287 2290  
2288 2291          /*
2289 2292           * Transform to file-system independent format
2290 2293           */
2291 2294          outcount = 0;
2292 2295          while (outcount < bytes_wanted) {
2293 2296                  ino64_t objnum;
2294 2297                  ushort_t reclen;
2295 2298                  off64_t *next = NULL;
2296 2299  
2297 2300                  /*
2298 2301                   * Special case `.', `..', and `.zfs'.
2299 2302                   */
2300 2303                  if (offset == 0) {
2301 2304                          (void) strcpy(zap.za_name, ".");
2302 2305                          zap.za_normalization_conflict = 0;
2303 2306                          objnum = zp->z_id;
2304 2307                  } else if (offset == 1) {
2305 2308                          (void) strcpy(zap.za_name, "..");
2306 2309                          zap.za_normalization_conflict = 0;
2307 2310                          objnum = parent;
2308 2311                  } else if (offset == 2 && zfs_show_ctldir(zp)) {
2309 2312                          (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2310 2313                          zap.za_normalization_conflict = 0;
2311 2314                          objnum = ZFSCTL_INO_ROOT;
2312 2315                  } else {
2313 2316                          /*
2314 2317                           * Grab next entry.
2315 2318                           */
2316 2319                          if (error = zap_cursor_retrieve(&zc, &zap)) {
2317 2320                                  if ((*eofp = (error == ENOENT)) != 0)
2318 2321                                          break;
2319 2322                                  else
2320 2323                                          goto update;
2321 2324                          }
2322 2325  
2323 2326                          if (zap.za_integer_length != 8 ||
2324 2327                              zap.za_num_integers != 1) {
2325 2328                                  cmn_err(CE_WARN, "zap_readdir: bad directory "
2326 2329                                      "entry, obj = %lld, offset = %lld\n",
2327 2330                                      (u_longlong_t)zp->z_id,
2328 2331                                      (u_longlong_t)offset);
2329 2332                                  error = SET_ERROR(ENXIO);
2330 2333                                  goto update;
2331 2334                          }
2332 2335  
2333 2336                          objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2334 2337                          /*
2335 2338                           * MacOS X can extract the object type here such as:
2336 2339                           * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2337 2340                           */
2338 2341  
2339 2342                          if (check_sysattrs && !zap.za_normalization_conflict) {
2340 2343                                  zap.za_normalization_conflict =
2341 2344                                      xattr_sysattr_casechk(zap.za_name);
2342 2345                          }
2343 2346                  }
2344 2347  
2345 2348                  if (flags & V_RDDIR_ACCFILTER) {
2346 2349                          /*
2347 2350                           * If we have no access at all, don't include
2348 2351                           * this entry in the returned information
2349 2352                           */
2350 2353                          znode_t *ezp;
2351 2354                          if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2352 2355                                  goto skip_entry;
2353 2356                          if (!zfs_has_access(ezp, cr)) {
2354 2357                                  VN_RELE(ZTOV(ezp));
2355 2358                                  goto skip_entry;
2356 2359                          }
2357 2360                          VN_RELE(ZTOV(ezp));
2358 2361                  }
2359 2362  
2360 2363                  if (flags & V_RDDIR_ENTFLAGS)
2361 2364                          reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2362 2365                  else
2363 2366                          reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2364 2367  
2365 2368                  /*
2366 2369                   * Will this entry fit in the buffer?
2367 2370                   */
2368 2371                  if (outcount + reclen > bufsize) {
2369 2372                          /*
2370 2373                           * Did we manage to fit anything in the buffer?
2371 2374                           */
2372 2375                          if (!outcount) {
2373 2376                                  error = SET_ERROR(EINVAL);
2374 2377                                  goto update;
2375 2378                          }
2376 2379                          break;
2377 2380                  }
2378 2381                  if (flags & V_RDDIR_ENTFLAGS) {
2379 2382                          /*
2380 2383                           * Add extended flag entry:
2381 2384                           */
2382 2385                          eodp->ed_ino = objnum;
2383 2386                          eodp->ed_reclen = reclen;
2384 2387                          /* NOTE: ed_off is the offset for the *next* entry */
2385 2388                          next = &(eodp->ed_off);
2386 2389                          eodp->ed_eflags = zap.za_normalization_conflict ?
2387 2390                              ED_CASE_CONFLICT : 0;
2388 2391                          (void) strncpy(eodp->ed_name, zap.za_name,
2389 2392                              EDIRENT_NAMELEN(reclen));
2390 2393                          eodp = (edirent_t *)((intptr_t)eodp + reclen);
2391 2394                  } else {
2392 2395                          /*
2393 2396                           * Add normal entry:
2394 2397                           */
2395 2398                          odp->d_ino = objnum;
2396 2399                          odp->d_reclen = reclen;
2397 2400                          /* NOTE: d_off is the offset for the *next* entry */
2398 2401                          next = &(odp->d_off);
2399 2402                          (void) strncpy(odp->d_name, zap.za_name,
2400 2403                              DIRENT64_NAMELEN(reclen));
2401 2404                          odp = (dirent64_t *)((intptr_t)odp + reclen);
2402 2405                  }
2403 2406                  outcount += reclen;
2404 2407  
2405 2408                  ASSERT(outcount <= bufsize);
2406 2409  
2407 2410                  /* Prefetch znode */
2408 2411                  if (prefetch)
2409 2412                          dmu_prefetch(os, objnum, 0, 0, 0,
2410 2413                              ZIO_PRIORITY_SYNC_READ);
2411 2414  
2412 2415          skip_entry:
2413 2416                  /*
2414 2417                   * Move to the next entry, fill in the previous offset.
2415 2418                   */
2416 2419                  if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2417 2420                          zap_cursor_advance(&zc);
2418 2421                          offset = zap_cursor_serialize(&zc);
2419 2422                  } else {
2420 2423                          offset += 1;
2421 2424                  }
2422 2425                  if (next)
2423 2426                          *next = offset;
2424 2427          }
2425 2428          zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2426 2429  
2427 2430          if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2428 2431                  iovp->iov_base += outcount;
2429 2432                  iovp->iov_len -= outcount;
2430 2433                  uio->uio_resid -= outcount;
2431 2434          } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2432 2435                  /*
2433 2436                   * Reset the pointer.
2434 2437                   */
2435 2438                  offset = uio->uio_loffset;
2436 2439          }
2437 2440  
2438 2441  update:
2439 2442          zap_cursor_fini(&zc);
2440 2443          if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2441 2444                  kmem_free(outbuf, bufsize);
2442 2445  
2443 2446          if (error == ENOENT)
2444 2447                  error = 0;
2445 2448  
2446 2449          ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2447 2450  
2448 2451          uio->uio_loffset = offset;
2449 2452          ZFS_EXIT(zfsvfs);
2450 2453          return (error);
2451 2454  }
2452 2455  
2453 2456  ulong_t zfs_fsync_sync_cnt = 4;
2454 2457  
2455 2458  static int
2456 2459  zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2457 2460  {
2458 2461          znode_t *zp = VTOZ(vp);
2459 2462          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2460 2463  
2461 2464          /*
2462 2465           * Regardless of whether this is required for standards conformance,
2463 2466           * this is the logical behavior when fsync() is called on a file with
2464 2467           * dirty pages.  We use B_ASYNC since the ZIL transactions are already
2465 2468           * going to be pushed out as part of the zil_commit().
2466 2469           */
2467 2470          if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2468 2471              (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2469 2472                  (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2470 2473  
2471 2474          (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2472 2475  
2473 2476          if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2474 2477                  ZFS_ENTER(zfsvfs);
2475 2478                  ZFS_VERIFY_ZP(zp);
2476 2479                  zil_commit(zfsvfs->z_log, zp->z_id);
2477 2480                  ZFS_EXIT(zfsvfs);
2478 2481          }
2479 2482          return (0);
2480 2483  }
2481 2484  
2482 2485  
2483 2486  /*
2484 2487   * Get the requested file attributes and place them in the provided
2485 2488   * vattr structure.
2486 2489   *
2487 2490   *      IN:     vp      - vnode of file.
2488 2491   *              vap     - va_mask identifies requested attributes.
2489 2492   *                        If AT_XVATTR set, then optional attrs are requested
2490 2493   *              flags   - ATTR_NOACLCHECK (CIFS server context)
2491 2494   *              cr      - credentials of caller.
2492 2495   *              ct      - caller context
2493 2496   *
2494 2497   *      OUT:    vap     - attribute values.
2495 2498   *
2496 2499   *      RETURN: 0 (always succeeds).
2497 2500   */
2498 2501  /* ARGSUSED */
2499 2502  static int
2500 2503  zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2501 2504      caller_context_t *ct)
2502 2505  {
2503 2506          znode_t *zp = VTOZ(vp);
2504 2507          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2505 2508          int     error = 0;
2506 2509          uint64_t links;
2507 2510          uint64_t mtime[2], ctime[2];
2508 2511          xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2509 2512          xoptattr_t *xoap = NULL;
2510 2513          boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2511 2514          sa_bulk_attr_t bulk[2];
2512 2515          int count = 0;
2513 2516  
2514 2517          ZFS_ENTER(zfsvfs);
2515 2518          ZFS_VERIFY_ZP(zp);
2516 2519  
2517 2520          zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2518 2521  
2519 2522          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2520 2523          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2521 2524  
2522 2525          if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2523 2526                  ZFS_EXIT(zfsvfs);
2524 2527                  return (error);
2525 2528          }
2526 2529  
2527 2530          /*
2528 2531           * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2529 2532           * Also, if we are the owner don't bother, since owner should
2530 2533           * always be allowed to read basic attributes of file.
2531 2534           */
2532 2535          if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2533 2536              (vap->va_uid != crgetuid(cr))) {
2534 2537                  if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2535 2538                      skipaclchk, cr)) {
2536 2539                          ZFS_EXIT(zfsvfs);
2537 2540                          return (error);
2538 2541                  }
2539 2542          }
2540 2543  
2541 2544          /*
2542 2545           * Return all attributes.  It's cheaper to provide the answer
2543 2546           * than to determine whether we were asked the question.
2544 2547           */
2545 2548  
2546 2549          mutex_enter(&zp->z_lock);
2547 2550          vap->va_type = vp->v_type;
2548 2551          vap->va_mode = zp->z_mode & MODEMASK;
2549 2552          vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2550 2553          vap->va_nodeid = zp->z_id;
2551 2554          if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2552 2555                  links = zp->z_links + 1;
2553 2556          else
2554 2557                  links = zp->z_links;
2555 2558          vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2556 2559          vap->va_size = zp->z_size;
2557 2560          vap->va_rdev = vp->v_rdev;
2558 2561          vap->va_seq = zp->z_seq;
2559 2562  
2560 2563          /*
2561 2564           * Add in any requested optional attributes and the create time.
2562 2565           * Also set the corresponding bits in the returned attribute bitmap.
2563 2566           */
2564 2567          if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2565 2568                  if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2566 2569                          xoap->xoa_archive =
2567 2570                              ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2568 2571                          XVA_SET_RTN(xvap, XAT_ARCHIVE);
2569 2572                  }
2570 2573  
2571 2574                  if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2572 2575                          xoap->xoa_readonly =
2573 2576                              ((zp->z_pflags & ZFS_READONLY) != 0);
2574 2577                          XVA_SET_RTN(xvap, XAT_READONLY);
2575 2578                  }
2576 2579  
2577 2580                  if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2578 2581                          xoap->xoa_system =
2579 2582                              ((zp->z_pflags & ZFS_SYSTEM) != 0);
2580 2583                          XVA_SET_RTN(xvap, XAT_SYSTEM);
2581 2584                  }
2582 2585  
2583 2586                  if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2584 2587                          xoap->xoa_hidden =
2585 2588                              ((zp->z_pflags & ZFS_HIDDEN) != 0);
2586 2589                          XVA_SET_RTN(xvap, XAT_HIDDEN);
2587 2590                  }
2588 2591  
2589 2592                  if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2590 2593                          xoap->xoa_nounlink =
2591 2594                              ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2592 2595                          XVA_SET_RTN(xvap, XAT_NOUNLINK);
2593 2596                  }
2594 2597  
2595 2598                  if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2596 2599                          xoap->xoa_immutable =
2597 2600                              ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2598 2601                          XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2599 2602                  }
2600 2603  
2601 2604                  if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2602 2605                          xoap->xoa_appendonly =
2603 2606                              ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2604 2607                          XVA_SET_RTN(xvap, XAT_APPENDONLY);
2605 2608                  }
2606 2609  
2607 2610                  if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2608 2611                          xoap->xoa_nodump =
2609 2612                              ((zp->z_pflags & ZFS_NODUMP) != 0);
2610 2613                          XVA_SET_RTN(xvap, XAT_NODUMP);
2611 2614                  }
2612 2615  
2613 2616                  if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2614 2617                          xoap->xoa_opaque =
2615 2618                              ((zp->z_pflags & ZFS_OPAQUE) != 0);
2616 2619                          XVA_SET_RTN(xvap, XAT_OPAQUE);
2617 2620                  }
2618 2621  
2619 2622                  if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2620 2623                          xoap->xoa_av_quarantined =
2621 2624                              ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2622 2625                          XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2623 2626                  }
2624 2627  
2625 2628                  if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2626 2629                          xoap->xoa_av_modified =
2627 2630                              ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2628 2631                          XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2629 2632                  }
2630 2633  
2631 2634                  if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2632 2635                      vp->v_type == VREG) {
2633 2636                          zfs_sa_get_scanstamp(zp, xvap);
2634 2637                  }
2635 2638  
2636 2639                  if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2637 2640                          uint64_t times[2];
2638 2641  
2639 2642                          (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2640 2643                              times, sizeof (times));
2641 2644                          ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2642 2645                          XVA_SET_RTN(xvap, XAT_CREATETIME);
2643 2646                  }
2644 2647  
2645 2648                  if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2646 2649                          xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2647 2650                          XVA_SET_RTN(xvap, XAT_REPARSE);
2648 2651                  }
2649 2652                  if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2650 2653                          xoap->xoa_generation = zp->z_gen;
2651 2654                          XVA_SET_RTN(xvap, XAT_GEN);
2652 2655                  }
2653 2656  
2654 2657                  if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2655 2658                          xoap->xoa_offline =
2656 2659                              ((zp->z_pflags & ZFS_OFFLINE) != 0);
2657 2660                          XVA_SET_RTN(xvap, XAT_OFFLINE);
2658 2661                  }
2659 2662  
2660 2663                  if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2661 2664                          xoap->xoa_sparse =
2662 2665                              ((zp->z_pflags & ZFS_SPARSE) != 0);
2663 2666                          XVA_SET_RTN(xvap, XAT_SPARSE);
2664 2667                  }
2665 2668          }
2666 2669  
2667 2670          ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2668 2671          ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2669 2672          ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2670 2673  
2671 2674          mutex_exit(&zp->z_lock);
2672 2675  
2673 2676          sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2674 2677  
2675 2678          if (zp->z_blksz == 0) {
2676 2679                  /*
2677 2680                   * Block size hasn't been set; suggest maximal I/O transfers.
2678 2681                   */
2679 2682                  vap->va_blksize = zfsvfs->z_max_blksz;
2680 2683          }
2681 2684  
2682 2685          ZFS_EXIT(zfsvfs);
2683 2686          return (0);
2684 2687  }
2685 2688  
2686 2689  /*
2687 2690   * Set the file attributes to the values contained in the
2688 2691   * vattr structure.
2689 2692   *
2690 2693   *      IN:     vp      - vnode of file to be modified.
2691 2694   *              vap     - new attribute values.
2692 2695   *                        If AT_XVATTR set, then optional attrs are being set
2693 2696   *              flags   - ATTR_UTIME set if non-default time values provided.
2694 2697   *                      - ATTR_NOACLCHECK (CIFS context only).
2695 2698   *              cr      - credentials of caller.
2696 2699   *              ct      - caller context
2697 2700   *
2698 2701   *      RETURN: 0 on success, error code on failure.
2699 2702   *
2700 2703   * Timestamps:
2701 2704   *      vp - ctime updated, mtime updated if size changed.
2702 2705   */
2703 2706  /* ARGSUSED */
2704 2707  static int
2705 2708  zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2706 2709      caller_context_t *ct)
2707 2710  {
2708 2711          znode_t         *zp = VTOZ(vp);
2709 2712          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
2710 2713          zilog_t         *zilog;
2711 2714          dmu_tx_t        *tx;
2712 2715          vattr_t         oldva;
2713 2716          xvattr_t        tmpxvattr;
2714 2717          uint_t          mask = vap->va_mask;
2715 2718          uint_t          saved_mask = 0;
2716 2719          int             trim_mask = 0;
2717 2720          uint64_t        new_mode;
2718 2721          uint64_t        new_uid, new_gid;
2719 2722          uint64_t        xattr_obj;
2720 2723          uint64_t        mtime[2], ctime[2];
2721 2724          znode_t         *attrzp;
2722 2725          int             need_policy = FALSE;
2723 2726          int             err, err2;
2724 2727          zfs_fuid_info_t *fuidp = NULL;
2725 2728          xvattr_t *xvap = (xvattr_t *)vap;       /* vap may be an xvattr_t * */
2726 2729          xoptattr_t      *xoap;
2727 2730          zfs_acl_t       *aclp;
2728 2731          boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2729 2732          boolean_t       fuid_dirtied = B_FALSE;
2730 2733          sa_bulk_attr_t  bulk[7], xattr_bulk[7];
2731 2734          int             count = 0, xattr_count = 0;
2732 2735  
2733 2736          if (mask == 0)
2734 2737                  return (0);
2735 2738  
2736 2739          if (mask & AT_NOSET)
2737 2740                  return (SET_ERROR(EINVAL));
2738 2741  
2739 2742          ZFS_ENTER(zfsvfs);
2740 2743          ZFS_VERIFY_ZP(zp);
2741 2744  
2742 2745          zilog = zfsvfs->z_log;
2743 2746  
2744 2747          /*
2745 2748           * Make sure that if we have ephemeral uid/gid or xvattr specified
2746 2749           * that file system is at proper version level
2747 2750           */
2748 2751  
2749 2752          if (zfsvfs->z_use_fuids == B_FALSE &&
2750 2753              (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2751 2754              ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2752 2755              (mask & AT_XVATTR))) {
2753 2756                  ZFS_EXIT(zfsvfs);
2754 2757                  return (SET_ERROR(EINVAL));
2755 2758          }
2756 2759  
2757 2760          if (mask & AT_SIZE && vp->v_type == VDIR) {
2758 2761                  ZFS_EXIT(zfsvfs);
2759 2762                  return (SET_ERROR(EISDIR));
2760 2763          }
2761 2764  
2762 2765          if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2763 2766                  ZFS_EXIT(zfsvfs);
2764 2767                  return (SET_ERROR(EINVAL));
2765 2768          }
2766 2769  
2767 2770          /*
2768 2771           * If this is an xvattr_t, then get a pointer to the structure of
2769 2772           * optional attributes.  If this is NULL, then we have a vattr_t.
2770 2773           */
2771 2774          xoap = xva_getxoptattr(xvap);
2772 2775  
2773 2776          xva_init(&tmpxvattr);
2774 2777  
2775 2778          /*
2776 2779           * Immutable files can only alter immutable bit and atime
2777 2780           */
2778 2781          if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2779 2782              ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2780 2783              ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2781 2784                  ZFS_EXIT(zfsvfs);
2782 2785                  return (SET_ERROR(EPERM));
2783 2786          }
2784 2787  
2785 2788          if ((mask & AT_SIZE) && (zp->z_pflags & ZFS_READONLY)) {
2786 2789                  ZFS_EXIT(zfsvfs);
2787 2790                  return (SET_ERROR(EPERM));
2788 2791          }
2789 2792  
2790 2793          /*
2791 2794           * Verify timestamps doesn't overflow 32 bits.
2792 2795           * ZFS can handle large timestamps, but 32bit syscalls can't
2793 2796           * handle times greater than 2039.  This check should be removed
2794 2797           * once large timestamps are fully supported.
2795 2798           */
2796 2799          if (mask & (AT_ATIME | AT_MTIME)) {
2797 2800                  if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2798 2801                      ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2799 2802                          ZFS_EXIT(zfsvfs);
2800 2803                          return (SET_ERROR(EOVERFLOW));
2801 2804                  }
2802 2805          }
2803 2806  
2804 2807  top:
2805 2808          attrzp = NULL;
2806 2809          aclp = NULL;
2807 2810  
2808 2811          /* Can this be moved to before the top label? */
2809 2812          if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2810 2813                  ZFS_EXIT(zfsvfs);
2811 2814                  return (SET_ERROR(EROFS));
2812 2815          }
2813 2816  
2814 2817          /*
2815 2818           * First validate permissions
2816 2819           */
2817 2820  
2818 2821          if (mask & AT_SIZE) {
2819 2822                  err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2820 2823                  if (err) {
2821 2824                          ZFS_EXIT(zfsvfs);
2822 2825                          return (err);
2823 2826                  }
2824 2827                  /*
2825 2828                   * XXX - Note, we are not providing any open
2826 2829                   * mode flags here (like FNDELAY), so we may
2827 2830                   * block if there are locks present... this
2828 2831                   * should be addressed in openat().
2829 2832                   */
2830 2833                  /* XXX - would it be OK to generate a log record here? */
2831 2834                  err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2832 2835                  if (err) {
2833 2836                          ZFS_EXIT(zfsvfs);
2834 2837                          return (err);
2835 2838                  }
2836 2839  
2837 2840                  if (vap->va_size == 0) {
2838 2841                          vnevent_truncate(ZTOV(zp), ct);
2839 2842                  } else {
2840 2843                          vnevent_resize(ZTOV(zp), ct);
2841 2844                  }
2842 2845          }
2843 2846  
2844 2847          if (mask & (AT_ATIME|AT_MTIME) ||
2845 2848              ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2846 2849              XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2847 2850              XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2848 2851              XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2849 2852              XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2850 2853              XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2851 2854              XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2852 2855                  need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2853 2856                      skipaclchk, cr);
2854 2857          }
2855 2858  
2856 2859          if (mask & (AT_UID|AT_GID)) {
2857 2860                  int     idmask = (mask & (AT_UID|AT_GID));
2858 2861                  int     take_owner;
2859 2862                  int     take_group;
2860 2863  
2861 2864                  /*
2862 2865                   * NOTE: even if a new mode is being set,
2863 2866                   * we may clear S_ISUID/S_ISGID bits.
2864 2867                   */
2865 2868  
2866 2869                  if (!(mask & AT_MODE))
2867 2870                          vap->va_mode = zp->z_mode;
2868 2871  
2869 2872                  /*
2870 2873                   * Take ownership or chgrp to group we are a member of
2871 2874                   */
2872 2875  
2873 2876                  take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2874 2877                  take_group = (mask & AT_GID) &&
2875 2878                      zfs_groupmember(zfsvfs, vap->va_gid, cr);
2876 2879  
2877 2880                  /*
2878 2881                   * If both AT_UID and AT_GID are set then take_owner and
2879 2882                   * take_group must both be set in order to allow taking
2880 2883                   * ownership.
2881 2884                   *
2882 2885                   * Otherwise, send the check through secpolicy_vnode_setattr()
2883 2886                   *
2884 2887                   */
2885 2888  
2886 2889                  if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2887 2890                      ((idmask == AT_UID) && take_owner) ||
2888 2891                      ((idmask == AT_GID) && take_group)) {
2889 2892                          if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2890 2893                              skipaclchk, cr) == 0) {
2891 2894                                  /*
2892 2895                                   * Remove setuid/setgid for non-privileged users
2893 2896                                   */
2894 2897                                  secpolicy_setid_clear(vap, cr);
2895 2898                                  trim_mask = (mask & (AT_UID|AT_GID));
2896 2899                          } else {
2897 2900                                  need_policy =  TRUE;
2898 2901                          }
2899 2902                  } else {
2900 2903                          need_policy =  TRUE;
2901 2904                  }
2902 2905          }
2903 2906  
2904 2907          mutex_enter(&zp->z_lock);
2905 2908          oldva.va_mode = zp->z_mode;
2906 2909          zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2907 2910          if (mask & AT_XVATTR) {
2908 2911                  /*
2909 2912                   * Update xvattr mask to include only those attributes
2910 2913                   * that are actually changing.
2911 2914                   *
2912 2915                   * the bits will be restored prior to actually setting
2913 2916                   * the attributes so the caller thinks they were set.
2914 2917                   */
2915 2918                  if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2916 2919                          if (xoap->xoa_appendonly !=
2917 2920                              ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2918 2921                                  need_policy = TRUE;
2919 2922                          } else {
2920 2923                                  XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2921 2924                                  XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2922 2925                          }
2923 2926                  }
2924 2927  
2925 2928                  if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2926 2929                          if (xoap->xoa_nounlink !=
2927 2930                              ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2928 2931                                  need_policy = TRUE;
2929 2932                          } else {
2930 2933                                  XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2931 2934                                  XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2932 2935                          }
2933 2936                  }
2934 2937  
2935 2938                  if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2936 2939                          if (xoap->xoa_immutable !=
2937 2940                              ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2938 2941                                  need_policy = TRUE;
2939 2942                          } else {
2940 2943                                  XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2941 2944                                  XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2942 2945                          }
2943 2946                  }
2944 2947  
2945 2948                  if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2946 2949                          if (xoap->xoa_nodump !=
2947 2950                              ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2948 2951                                  need_policy = TRUE;
2949 2952                          } else {
2950 2953                                  XVA_CLR_REQ(xvap, XAT_NODUMP);
2951 2954                                  XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2952 2955                          }
2953 2956                  }
2954 2957  
2955 2958                  if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2956 2959                          if (xoap->xoa_av_modified !=
2957 2960                              ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2958 2961                                  need_policy = TRUE;
2959 2962                          } else {
2960 2963                                  XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2961 2964                                  XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2962 2965                          }
2963 2966                  }
2964 2967  
2965 2968                  if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2966 2969                          if ((vp->v_type != VREG &&
2967 2970                              xoap->xoa_av_quarantined) ||
2968 2971                              xoap->xoa_av_quarantined !=
2969 2972                              ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2970 2973                                  need_policy = TRUE;
2971 2974                          } else {
2972 2975                                  XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2973 2976                                  XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2974 2977                          }
2975 2978                  }
2976 2979  
2977 2980                  if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2978 2981                          mutex_exit(&zp->z_lock);
2979 2982                          ZFS_EXIT(zfsvfs);
2980 2983                          return (SET_ERROR(EPERM));
2981 2984                  }
2982 2985  
2983 2986                  if (need_policy == FALSE &&
2984 2987                      (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2985 2988                      XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2986 2989                          need_policy = TRUE;
2987 2990                  }
2988 2991          }
2989 2992  
2990 2993          mutex_exit(&zp->z_lock);
2991 2994  
2992 2995          if (mask & AT_MODE) {
2993 2996                  if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
2994 2997                          err = secpolicy_setid_setsticky_clear(vp, vap,
2995 2998                              &oldva, cr);
2996 2999                          if (err) {
2997 3000                                  ZFS_EXIT(zfsvfs);
2998 3001                                  return (err);
2999 3002                          }
3000 3003                          trim_mask |= AT_MODE;
3001 3004                  } else {
3002 3005                          need_policy = TRUE;
3003 3006                  }
3004 3007          }
3005 3008  
3006 3009          if (need_policy) {
3007 3010                  /*
3008 3011                   * If trim_mask is set then take ownership
3009 3012                   * has been granted or write_acl is present and user
3010 3013                   * has the ability to modify mode.  In that case remove
3011 3014                   * UID|GID and or MODE from mask so that
3012 3015                   * secpolicy_vnode_setattr() doesn't revoke it.
3013 3016                   */
3014 3017  
3015 3018                  if (trim_mask) {
3016 3019                          saved_mask = vap->va_mask;
3017 3020                          vap->va_mask &= ~trim_mask;
3018 3021                  }
3019 3022                  err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3020 3023                      (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3021 3024                  if (err) {
3022 3025                          ZFS_EXIT(zfsvfs);
3023 3026                          return (err);
3024 3027                  }
3025 3028  
3026 3029                  if (trim_mask)
3027 3030                          vap->va_mask |= saved_mask;
3028 3031          }
3029 3032  
3030 3033          /*
3031 3034           * secpolicy_vnode_setattr, or take ownership may have
3032 3035           * changed va_mask
3033 3036           */
3034 3037          mask = vap->va_mask;
3035 3038  
3036 3039          if ((mask & (AT_UID | AT_GID))) {
3037 3040                  err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3038 3041                      &xattr_obj, sizeof (xattr_obj));
3039 3042  
3040 3043                  if (err == 0 && xattr_obj) {
3041 3044                          err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3042 3045                          if (err)
3043 3046                                  goto out2;
3044 3047                  }
3045 3048                  if (mask & AT_UID) {
3046 3049                          new_uid = zfs_fuid_create(zfsvfs,
3047 3050                              (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3048 3051                          if (new_uid != zp->z_uid &&
3049 3052                              zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3050 3053                                  if (attrzp)
3051 3054                                          VN_RELE(ZTOV(attrzp));
3052 3055                                  err = SET_ERROR(EDQUOT);
3053 3056                                  goto out2;
3054 3057                          }
3055 3058                  }
3056 3059  
3057 3060                  if (mask & AT_GID) {
3058 3061                          new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3059 3062                              cr, ZFS_GROUP, &fuidp);
3060 3063                          if (new_gid != zp->z_gid &&
3061 3064                              zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3062 3065                                  if (attrzp)
3063 3066                                          VN_RELE(ZTOV(attrzp));
3064 3067                                  err = SET_ERROR(EDQUOT);
3065 3068                                  goto out2;
3066 3069                          }
3067 3070                  }
3068 3071          }
3069 3072          tx = dmu_tx_create(zfsvfs->z_os);
3070 3073  
3071 3074          if (mask & AT_MODE) {
3072 3075                  uint64_t pmode = zp->z_mode;
3073 3076                  uint64_t acl_obj;
3074 3077                  new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3075 3078  
3076 3079                  if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3077 3080                      !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3078 3081                          err = SET_ERROR(EPERM);
3079 3082                          goto out;
3080 3083                  }
3081 3084  
3082 3085                  if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3083 3086                          goto out;
3084 3087  
3085 3088                  mutex_enter(&zp->z_lock);
3086 3089                  if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3087 3090                          /*
3088 3091                           * Are we upgrading ACL from old V0 format
3089 3092                           * to V1 format?
3090 3093                           */
3091 3094                          if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3092 3095                              zfs_znode_acl_version(zp) ==
3093 3096                              ZFS_ACL_VERSION_INITIAL) {
3094 3097                                  dmu_tx_hold_free(tx, acl_obj, 0,
3095 3098                                      DMU_OBJECT_END);
3096 3099                                  dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3097 3100                                      0, aclp->z_acl_bytes);
3098 3101                          } else {
3099 3102                                  dmu_tx_hold_write(tx, acl_obj, 0,
3100 3103                                      aclp->z_acl_bytes);
3101 3104                          }
3102 3105                  } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3103 3106                          dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3104 3107                              0, aclp->z_acl_bytes);
3105 3108                  }
3106 3109                  mutex_exit(&zp->z_lock);
3107 3110                  dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3108 3111          } else {
3109 3112                  if ((mask & AT_XVATTR) &&
3110 3113                      XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3111 3114                          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3112 3115                  else
3113 3116                          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3114 3117          }
3115 3118  
3116 3119          if (attrzp) {
3117 3120                  dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3118 3121          }
3119 3122  
3120 3123          fuid_dirtied = zfsvfs->z_fuid_dirty;
3121 3124          if (fuid_dirtied)
3122 3125                  zfs_fuid_txhold(zfsvfs, tx);
3123 3126  
3124 3127          zfs_sa_upgrade_txholds(tx, zp);
3125 3128  
3126 3129          err = dmu_tx_assign(tx, TXG_WAIT);
3127 3130          if (err)
3128 3131                  goto out;
3129 3132  
3130 3133          count = 0;
3131 3134          /*
3132 3135           * Set each attribute requested.
3133 3136           * We group settings according to the locks they need to acquire.
3134 3137           *
3135 3138           * Note: you cannot set ctime directly, although it will be
3136 3139           * updated as a side-effect of calling this function.
3137 3140           */
3138 3141  
3139 3142  
3140 3143          if (mask & (AT_UID|AT_GID|AT_MODE))
3141 3144                  mutex_enter(&zp->z_acl_lock);
3142 3145          mutex_enter(&zp->z_lock);
3143 3146  
3144 3147          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3145 3148              &zp->z_pflags, sizeof (zp->z_pflags));
3146 3149  
3147 3150          if (attrzp) {
3148 3151                  if (mask & (AT_UID|AT_GID|AT_MODE))
3149 3152                          mutex_enter(&attrzp->z_acl_lock);
3150 3153                  mutex_enter(&attrzp->z_lock);
3151 3154                  SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3152 3155                      SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3153 3156                      sizeof (attrzp->z_pflags));
3154 3157          }
3155 3158  
3156 3159          if (mask & (AT_UID|AT_GID)) {
3157 3160  
3158 3161                  if (mask & AT_UID) {
3159 3162                          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3160 3163                              &new_uid, sizeof (new_uid));
3161 3164                          zp->z_uid = new_uid;
3162 3165                          if (attrzp) {
3163 3166                                  SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3164 3167                                      SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3165 3168                                      sizeof (new_uid));
3166 3169                                  attrzp->z_uid = new_uid;
3167 3170                          }
3168 3171                  }
3169 3172  
3170 3173                  if (mask & AT_GID) {
3171 3174                          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3172 3175                              NULL, &new_gid, sizeof (new_gid));
3173 3176                          zp->z_gid = new_gid;
3174 3177                          if (attrzp) {
3175 3178                                  SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3176 3179                                      SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3177 3180                                      sizeof (new_gid));
3178 3181                                  attrzp->z_gid = new_gid;
3179 3182                          }
3180 3183                  }
3181 3184                  if (!(mask & AT_MODE)) {
3182 3185                          SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3183 3186                              NULL, &new_mode, sizeof (new_mode));
3184 3187                          new_mode = zp->z_mode;
3185 3188                  }
3186 3189                  err = zfs_acl_chown_setattr(zp);
3187 3190                  ASSERT(err == 0);
3188 3191                  if (attrzp) {
3189 3192                          err = zfs_acl_chown_setattr(attrzp);
3190 3193                          ASSERT(err == 0);
3191 3194                  }
3192 3195          }
3193 3196  
3194 3197          if (mask & AT_MODE) {
3195 3198                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3196 3199                      &new_mode, sizeof (new_mode));
3197 3200                  zp->z_mode = new_mode;
3198 3201                  ASSERT3U((uintptr_t)aclp, !=, NULL);
3199 3202                  err = zfs_aclset_common(zp, aclp, cr, tx);
3200 3203                  ASSERT0(err);
3201 3204                  if (zp->z_acl_cached)
3202 3205                          zfs_acl_free(zp->z_acl_cached);
3203 3206                  zp->z_acl_cached = aclp;
3204 3207                  aclp = NULL;
3205 3208          }
3206 3209  
3207 3210  
3208 3211          if (mask & AT_ATIME) {
3209 3212                  ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3210 3213                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3211 3214                      &zp->z_atime, sizeof (zp->z_atime));
3212 3215          }
3213 3216  
3214 3217          if (mask & AT_MTIME) {
3215 3218                  ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3216 3219                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3217 3220                      mtime, sizeof (mtime));
3218 3221          }
3219 3222  
3220 3223          /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3221 3224          if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3222 3225                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3223 3226                      NULL, mtime, sizeof (mtime));
3224 3227                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3225 3228                      &ctime, sizeof (ctime));
3226 3229                  zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3227 3230                      B_TRUE);
3228 3231          } else if (mask != 0) {
3229 3232                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3230 3233                      &ctime, sizeof (ctime));
3231 3234                  zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3232 3235                      B_TRUE);
3233 3236                  if (attrzp) {
3234 3237                          SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3235 3238                              SA_ZPL_CTIME(zfsvfs), NULL,
3236 3239                              &ctime, sizeof (ctime));
3237 3240                          zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3238 3241                              mtime, ctime, B_TRUE);
3239 3242                  }
3240 3243          }
3241 3244          /*
3242 3245           * Do this after setting timestamps to prevent timestamp
3243 3246           * update from toggling bit
3244 3247           */
3245 3248  
3246 3249          if (xoap && (mask & AT_XVATTR)) {
3247 3250  
3248 3251                  /*
3249 3252                   * restore trimmed off masks
3250 3253                   * so that return masks can be set for caller.
3251 3254                   */
3252 3255  
3253 3256                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3254 3257                          XVA_SET_REQ(xvap, XAT_APPENDONLY);
3255 3258                  }
3256 3259                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3257 3260                          XVA_SET_REQ(xvap, XAT_NOUNLINK);
3258 3261                  }
3259 3262                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3260 3263                          XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3261 3264                  }
3262 3265                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3263 3266                          XVA_SET_REQ(xvap, XAT_NODUMP);
3264 3267                  }
3265 3268                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3266 3269                          XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3267 3270                  }
3268 3271                  if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3269 3272                          XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3270 3273                  }
3271 3274  
3272 3275                  if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3273 3276                          ASSERT(vp->v_type == VREG);
3274 3277  
3275 3278                  zfs_xvattr_set(zp, xvap, tx);
3276 3279          }
3277 3280  
3278 3281          if (fuid_dirtied)
3279 3282                  zfs_fuid_sync(zfsvfs, tx);
3280 3283  
3281 3284          if (mask != 0)
3282 3285                  zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3283 3286  
3284 3287          mutex_exit(&zp->z_lock);
3285 3288          if (mask & (AT_UID|AT_GID|AT_MODE))
3286 3289                  mutex_exit(&zp->z_acl_lock);
3287 3290  
3288 3291          if (attrzp) {
3289 3292                  if (mask & (AT_UID|AT_GID|AT_MODE))
3290 3293                          mutex_exit(&attrzp->z_acl_lock);
3291 3294                  mutex_exit(&attrzp->z_lock);
3292 3295          }
3293 3296  out:
3294 3297          if (err == 0 && attrzp) {
3295 3298                  err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3296 3299                      xattr_count, tx);
3297 3300                  ASSERT(err2 == 0);
3298 3301          }
3299 3302  
3300 3303          if (attrzp)
3301 3304                  VN_RELE(ZTOV(attrzp));
3302 3305  
3303 3306          if (aclp)
3304 3307                  zfs_acl_free(aclp);
3305 3308  
3306 3309          if (fuidp) {
3307 3310                  zfs_fuid_info_free(fuidp);
3308 3311                  fuidp = NULL;
3309 3312          }
3310 3313  
3311 3314          if (err) {
3312 3315                  dmu_tx_abort(tx);
3313 3316                  if (err == ERESTART)
3314 3317                          goto top;
3315 3318          } else {
3316 3319                  err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3317 3320                  dmu_tx_commit(tx);
3318 3321          }
3319 3322  
3320 3323  out2:
3321 3324          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3322 3325                  zil_commit(zilog, 0);
3323 3326  
3324 3327          ZFS_EXIT(zfsvfs);
3325 3328          return (err);
3326 3329  }
3327 3330  
3328 3331  typedef struct zfs_zlock {
                   /*
                    * One element of the singly linked list of parent-lock holds
                    * that zfs_rename_lock() builds and zfs_rename_unlock() tears
                    * down.
                    */
3329 3332          krwlock_t       *zl_rwlock;     /* lock we acquired */
3330 3333          znode_t         *zl_znode;      /* znode we held; NULL if none */
3331 3334          struct zfs_zlock *zl_next;      /* next in list */
3332 3335  } zfs_zlock_t;
3333 3336  
3334 3337  /*
3335 3338   * Drop locks and release vnodes that were held by zfs_rename_lock().
            *
            * Walks the list headed at *zlpp.  For each entry it releases the
            * vnode hold (when zl_znode was recorded), exits zl_rwlock, unlinks
            * the entry and frees it.  On return *zlpp is NULL.
3336 3339   */
3337 3340  static void
3338 3341  zfs_rename_unlock(zfs_zlock_t **zlpp)
3339 3342  {
3340 3343          zfs_zlock_t *zl;
3341 3344  
3342 3345          while ((zl = *zlpp) != NULL) {
                           /* zl_znode is only set for entries that took a hold. */
3343 3346                  if (zl->zl_znode != NULL)
3344 3347                          VN_RELE(ZTOV(zl->zl_znode));
3345 3348                  rw_exit(zl->zl_rwlock);
                           /* Advance the head before freeing the entry it named. */
3346 3349                  *zlpp = zl->zl_next;
3347 3350                  kmem_free(zl, sizeof (*zl));
3348 3351          }
3349 3352  }
3350 3353  
3351 3354  /*
3352 3355   * Search back through the directory tree, using the ".." entries.
3353 3356   * Lock each directory in the chain to prevent concurrent renames.
3354 3357   * Fail any attempt to move a directory into one of its own descendants.
3355 3358   * XXX - z_parent_lock can overlap with map or grow locks
            *
            *      IN:     szp     - znode whose z_parent_lock is taken as writer
            *                        on the first pass, and whose id every
            *                        visited id is checked against.
            *              tdzp    - znode at which the upward walk starts.
            *              sdzp    - the walk stops once it reaches this
            *                        znode's id.
            *      IN/OUT: zlpp    - head of the list of held locks.  Each lock
            *                        taken is pushed onto the front of *zlpp, so
            *                        *zlpp must be initialized on entry.
            *
            *      RETURN: 0 if the root or sdzp was reached,
            *              EINVAL if tdzp is szp or lies beneath it,
            *              otherwise the error returned by zfs_zget().
            *
            * Locks already pushed onto *zlpp are not released here on an error
            * return; they stay on the list for zfs_rename_unlock().
3356 3359   */
3357 3360  static int
3358 3361  zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3359 3362  {
3360 3363          zfs_zlock_t     *zl;
3361 3364          znode_t         *zp = tdzp;
3362 3365          uint64_t        rootid = zp->z_zfsvfs->z_root;
3363 3366          uint64_t        oidp = zp->z_id;        /* id checked on this pass */
3364 3367          krwlock_t       *rwlp = &szp->z_parent_lock;
3365 3368          krw_t           rw = RW_WRITER;
3366 3369  
3367 3370          /*
3368 3371           * First pass write-locks szp and compares to zp->z_id.
3369 3372           * Later passes read-lock zp and compare to zp->z_parent.
3370 3373           */
3371 3374          do {
3372 3375                  if (!rw_tryenter(rwlp, rw)) {
3373 3376                          /*
3374 3377                           * Another thread is renaming in this path.
3375 3378                           * Note that if we are a WRITER, we don't have any
3376 3379                           * parent_locks held yet.
3377 3380                           */
3378 3381                          if (rw == RW_READER && zp->z_id > szp->z_id) {
3379 3382                                  /*
3380 3383                                   * Drop our locks and restart
                                            *
                                            * rw == RW_READER means at least one
                                            * entry has been pushed, and zl is that
                                            * newest entry, i.e. the current head
                                            * of *zlpp.  Unlocking through &zl
                                            * releases the whole list and leaves
                                            * zl NULL; *zlpp is reset explicitly.
                                            *
                                            * NOTE(review): the id comparison looks
                                            * like a deadlock-avoidance ordering
                                            * between renamers -- confirm.  Also,
                                            * `continue' in a do/while re-evaluates
                                            * the loop condition with zp == tdzp.
3381 3384                                   */
3382 3385                                  zfs_rename_unlock(&zl);
3383 3386                                  *zlpp = NULL;
3384 3387                                  zp = tdzp;
3385 3388                                  oidp = zp->z_id;
3386 3389                                  rwlp = &szp->z_parent_lock;
3387 3390                                  rw = RW_WRITER;
3388 3391                                  continue;
3389 3392                          } else {
3390 3393                                  /*
3391 3394                                   * Wait for other thread to drop its locks
3392 3395                                   */
3393 3396                                  rw_enter(rwlp, rw);
3394 3397                          }
3395 3398                  }
3396 3399  
                           /* Record the lock just taken at the head of *zlpp. */
3397 3400                  zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3398 3401                  zl->zl_rwlock = rwlp;
3399 3402                  zl->zl_znode = NULL;
3400 3403                  zl->zl_next = *zlpp;
3401 3404                  *zlpp = zl;
3402 3405  
3403 3406                  if (oidp == szp->z_id)          /* We're a descendant of szp */
3404 3407                          return (SET_ERROR(EINVAL));
3405 3408  
3406 3409                  if (oidp == rootid)             /* We've hit the top */
3407 3410                          return (0);
3408 3411  
3409 3412                  if (rw == RW_READER) {          /* i.e. not the first pass */
                                   /*
                                    * Step up to the znode for oidp.  The hold is
                                    * stored in zl so that zfs_rename_unlock() can
                                    * release it.
                                    */
3410 3413                          int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3411 3414                          if (error)
3412 3415                                  return (error);
3413 3416                          zl->zl_znode = zp;
3414 3417                  }
                           /*
                            * Read zp's parent object id into oidp for the next pass.
                            * The return value is deliberately ignored.
                            * NOTE(review): confirm whether a failed lookup can
                            * leave oidp holding its previous value.
                            */
3415 3418                  (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3416 3419                      &oidp, sizeof (oidp));
3417 3420                  rwlp = &zp->z_parent_lock;
3418 3421                  rw = RW_READER;
3419 3422  
3420 3423          } while (zp->z_id != sdzp->z_id);
3421 3424  
3422 3425          return (0);
3423 3426  }
3424 3427  
3425 3428  /*
3426 3429   * Move an entry from the provided source directory to the target
3427 3430   * directory.  Change the entry name as indicated.
3428 3431   *
3429 3432   *      IN:     sdvp    - Source directory containing the "old entry".
3430 3433   *              snm     - Old entry name.
3431 3434   *              tdvp    - Target directory to contain the "new entry".
3432 3435   *              tnm     - New entry name.
3433 3436   *              cr      - credentials of caller.
3434 3437   *              ct      - caller context
3435 3438   *              flags   - case flags
3436 3439   *
3437 3440   *      RETURN: 0 on success, error code on failure.
            *
            * Error codes set directly by this function (errors from the helpers
            * it calls are passed through unchanged):
            *      EXDEV   - tdvp belongs to a different zfsvfs or is a ctldir node.
            *      EILSEQ  - tnm fails UTF-8 validation on a utf8-only file system.
            *      EINVAL  - the source and target directories disagree on
            *                ZFS_XATTR, or locking snm/tnm failed and that name
            *                is "..".
            *      ENOTDIR - source is a directory, existing target is not.
            *      EISDIR  - source is not a directory, existing target is.
            *
            * The function returns 0 without changing anything when both names
            * compare equal within the same directory, or when the source and
            * the existing target entries refer to the same object.
3438 3441   *
3439 3442   * Timestamps:
3440 3443   *      sdvp,tdvp - ctime|mtime updated
3441 3444   */
3442 3445  /*ARGSUSED*/
3443 3446  static int
3444 3447  zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3445 3448      caller_context_t *ct, int flags)
3446 3449  {
3447 3450          znode_t         *tdzp, *szp, *tzp;
3448 3451          znode_t         *sdzp = VTOZ(sdvp);
3449 3452          zfsvfs_t        *zfsvfs = sdzp->z_zfsvfs;
3450 3453          zilog_t         *zilog;
3451 3454          vnode_t         *realvp;
3452 3455          zfs_dirlock_t   *sdl, *tdl;
3453 3456          dmu_tx_t        *tx;
3454 3457          zfs_zlock_t     *zl;
3455 3458          int             cmp, serr, terr;
3456 3459          int             error = 0, rm_err = 0;
3457 3460          int             zflg = 0;
3458 3461          boolean_t       waited = B_FALSE;
3459 3462  
                   /*
                    * Every explicit return below is paired with ZFS_EXIT(zfsvfs).
                    */
3460 3463          ZFS_ENTER(zfsvfs);
3461 3464          ZFS_VERIFY_ZP(sdzp);
3462 3465          zilog = zfsvfs->z_log;
3463 3466  
3464 3467          /*
3465 3468           * Make sure we have the real vp for the target directory.
3466 3469           */
3467 3470          if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3468 3471                  tdvp = realvp;
3469 3472  
3470 3473          tdzp = VTOZ(tdvp);
3471 3474          ZFS_VERIFY_ZP(tdzp);
3472 3475  
3473 3476          /*
3474 3477           * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3475 3478           * ctldir appear to have the same v_vfsp.
3476 3479           */
3477 3480          if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3478 3481                  ZFS_EXIT(zfsvfs);
3479 3482                  return (SET_ERROR(EXDEV));
3480 3483          }
3481 3484  
3482 3485          if (zfsvfs->z_utf8 && u8_validate(tnm,
3483 3486              strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3484 3487                  ZFS_EXIT(zfsvfs);
3485 3488                  return (SET_ERROR(EILSEQ));
3486 3489          }
3487 3490  
3488 3491          if (flags & FIGNORECASE)
3489 3492                  zflg |= ZCILOOK;
3490 3493  
           /*
            * Retry point.  The dmu_tx_assign() failure path jumps back here on
            * ERESTART after releasing every lock and hold taken below, so the
            * per-attempt pointers are reset each time around.
            */
3491 3494  top:
3492 3495          szp = NULL;
3493 3496          tzp = NULL;
3494 3497          zl = NULL;
3495 3498  
3496 3499          /*
3497 3500           * This is to prevent the creation of links into attribute space
3498 3501           * by renaming a linked file into/outof an attribute directory.
3499 3502           * See the comment in zfs_link() for why this is considered bad.
3500 3503           */
3501 3504          if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3502 3505                  ZFS_EXIT(zfsvfs);
3503 3506                  return (SET_ERROR(EINVAL));
3504 3507          }
3505 3508  
3506 3509          /*
3507 3510           * Lock source and target directory entries.  To prevent deadlock,
3508 3511           * a lock ordering must be defined.  We lock the directory with
3509 3512           * the smallest object id first, or if it's a tie, the one with
3510 3513           * the lexically first name.
3511 3514           */
3512 3515          if (sdzp->z_id < tdzp->z_id) {
3513 3516                  cmp = -1;
3514 3517          } else if (sdzp->z_id > tdzp->z_id) {
3515 3518                  cmp = 1;
3516 3519          } else {
3517 3520                  /*
3518 3521                   * First compare the two name arguments without
3519 3522                   * considering any case folding.
3520 3523                   */
3521 3524                  int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3522 3525  
3523 3526                  cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3524 3527                  ASSERT(error == 0 || !zfsvfs->z_utf8);
3525 3528                  if (cmp == 0) {
3526 3529                          /*
3527 3530                           * POSIX: "If the old argument and the new argument
3528 3531                           * both refer to links to the same existing file,
3529 3532                           * the rename() function shall return successfully
3530 3533                           * and perform no other action."
3531 3534                           */
3532 3535                          ZFS_EXIT(zfsvfs);
3533 3536                          return (0);
3534 3537                  }
3535 3538                  /*
3536 3539                   * If the file system is case-folding, then we may
3537 3540                   * have some more checking to do.  A case-folding file
3538 3541                   * system is either supporting mixed case sensitivity
3539 3542                   * access or is completely case-insensitive.  Note
3540 3543                   * that the file system is always case preserving.
3541 3544                   *
3542 3545                   * In mixed sensitivity mode case sensitive behavior
3543 3546                   * is the default.  FIGNORECASE must be used to
3544 3547                   * explicitly request case insensitive behavior.
3545 3548                   *
3546 3549                   * If the source and target names provided differ only
3547 3550                   * by case (e.g., a request to rename 'tim' to 'Tim'),
3548 3551                   * we will treat this as a special case in the
3549 3552                   * case-insensitive mode: as long as the source name
3550 3553                   * is an exact match, we will allow this to proceed as
3551 3554                   * a name-change request.
3552 3555                   */
3553 3556                  if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3554 3557                      (zfsvfs->z_case == ZFS_CASE_MIXED &&
3555 3558                      flags & FIGNORECASE)) &&
3556 3559                      u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3557 3560                      &error) == 0) {
3558 3561                          /*
3559 3562                           * case preserving rename request, require exact
3560 3563                           * name matches
3561 3564                           */
3562 3565                          zflg |= ZCIEXACT;
3563 3566                          zflg &= ~ZCILOOK;
3564 3567                  }
3565 3568          }
3566 3569  
3567 3570          /*
3568 3571           * If the source and destination directories are the same, we should
3569 3572           * grab the z_name_lock of that directory only once.
3570 3573           */
3571 3574          if (sdzp == tdzp) {
3572 3575                  zflg |= ZHAVELOCK;
3573 3576                  rw_enter(&sdzp->z_name_lock, RW_READER);
3574 3577          }
3575 3578  
                   /*
                    * Take the two directory-entry locks in the order chosen above:
                    * source first when cmp < 0, target first otherwise.  cmp is never
                    * 0 here, because equal names in the same directory returned
                    * early.  The source lookup always carries ZEXISTS; ZRENAMING is
                    * added to whichever of the two locks is taken second.
                    */
3576 3579          if (cmp < 0) {
3577 3580                  serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3578 3581                      ZEXISTS | zflg, NULL, NULL);
3579 3582                  terr = zfs_dirent_lock(&tdl,
3580 3583                      tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3581 3584          } else {
3582 3585                  terr = zfs_dirent_lock(&tdl,
3583 3586                      tdzp, tnm, &tzp, zflg, NULL, NULL);
3584 3587                  serr = zfs_dirent_lock(&sdl,
3585 3588                      sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3586 3589                      NULL, NULL);
3587 3590          }
3588 3591  
3589 3592          if (serr) {
3590 3593                  /*
3591 3594                   * Source entry invalid or not there.
3592 3595                   */
3593 3596                  if (!terr) {
3594 3597                          zfs_dirent_unlock(tdl);
3595 3598                          if (tzp)
3596 3599                                  VN_RELE(ZTOV(tzp));
3597 3600                  }
3598 3601  
3599 3602                  if (sdzp == tdzp)
3600 3603                          rw_exit(&sdzp->z_name_lock);
3601 3604  
                           /*
                            * A failure to lock ".." as the source is reported as
                            * EINVAL, replacing whatever zfs_dirent_lock() returned.
                            */
3602 3605                  if (strcmp(snm, "..") == 0)
3603 3606                          serr = SET_ERROR(EINVAL);
3604 3607                  ZFS_EXIT(zfsvfs);
3605 3608                  return (serr);
3606 3609          }
3607 3610          if (terr) {
                           /*
                            * The source lock succeeded but the target lock did not:
                            * undo the source lock and release szp.  As above, ".."
                            * as the target name maps the error to EINVAL.
                            */
3608 3611                  zfs_dirent_unlock(sdl);
3609 3612                  VN_RELE(ZTOV(szp));
3610 3613  
3611 3614                  if (sdzp == tdzp)
3612 3615                          rw_exit(&sdzp->z_name_lock);
3613 3616  
3614 3617                  if (strcmp(tnm, "..") == 0)
3615 3618                          terr = SET_ERROR(EINVAL);
3616 3619                  ZFS_EXIT(zfsvfs);
3617 3620                  return (terr);
3618 3621          }
3619 3622  
3620 3623          /*
3621 3624           * Must have write access at the source to remove the old entry
3622 3625           * and write access at the target to create the new entry.
3623 3626           * Note that if target and source are the same, this can be
3624 3627           * done in a single check.
3625 3628           */
3626 3629  
3627 3630          if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3628 3631                  goto out;
3629 3632  
3630 3633          if (ZTOV(szp)->v_type == VDIR) {
3631 3634                  /*
3632 3635                   * Check to make sure rename is valid.
3633 3636                   * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3634 3637                   */
3635 3638                  if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3636 3639                          goto out;
3637 3640          }
3638 3641  
3639 3642          /*
3640 3643           * Does target exist?
3641 3644           */
3642 3645          if (tzp) {
3643 3646                  /*
3644 3647                   * Source and target must be the same type.
3645 3648                   */
3646 3649                  if (ZTOV(szp)->v_type == VDIR) {
3647 3650                          if (ZTOV(tzp)->v_type != VDIR) {
3648 3651                                  error = SET_ERROR(ENOTDIR);
3649 3652                                  goto out;
3650 3653                          }
3651 3654                  } else {
3652 3655                          if (ZTOV(tzp)->v_type == VDIR) {
3653 3656                                  error = SET_ERROR(EISDIR);
3654 3657                                  goto out;
3655 3658                          }
3656 3659                  }
3657 3660                  /*
3658 3661                   * POSIX dictates that when the source and target
3659 3662                   * entries refer to the same file object, rename
3660 3663                   * must do nothing and exit without error.
3661 3664                   */
3662 3665                  if (szp->z_id == tzp->z_id) {
3663 3666                          error = 0;
3664 3667                          goto out;
3665 3668                  }
3666 3669          }
3667 3670  
                   /*
                    * Pre-rename notifications.  These are issued before
                    * dmu_tx_assign(), so an ERESTART retry through "top" issues
                    * them again.
                    */
3668 3671          vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct);
3669 3672          if (tzp)
3670 3673                  vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3671 3674  
3672 3675          /*
3673 3676           * notify the target directory if it is not the same
3674 3677           * as source directory.
3675 3678           */
3676 3679          if (tdvp != sdvp) {
3677 3680                  vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
3678 3681          }
3679 3682  
                   /*
                    * Set up the transaction holds: the SA handles of the source
                    * znode and the source directory, the ZAP entries for snm and
                    * tnm, the target directory's SA handle when it is a different
                    * directory, the existing target znode if there is one, and the
                    * unlinked-object ZAP.
                    */
3680 3683          tx = dmu_tx_create(zfsvfs->z_os);
3681 3684          dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3682 3685          dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3683 3686          dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3684 3687          dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3685 3688          if (sdzp != tdzp) {
3686 3689                  dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3687 3690                  zfs_sa_upgrade_txholds(tx, tdzp);
3688 3691          }
3689 3692          if (tzp) {
3690 3693                  dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3691 3694                  zfs_sa_upgrade_txholds(tx, tzp);
3692 3695          }
3693 3696  
3694 3697          zfs_sa_upgrade_txholds(tx, szp);
3695 3698          dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3696 3699          error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3697 3700          if (error) {
                           /*
                            * Could not assign the transaction.  Drop every lock and
                            * vnode hold acquired in this attempt.  On ERESTART, wait
                            * with dmu_tx_wait(), abort this tx and start over from
                            * "top" (later attempts pass TXG_WAITED); any other error
                            * aborts the tx and is returned to the caller.
                            */
3698 3701                  if (zl != NULL)
3699 3702                          zfs_rename_unlock(&zl);
3700 3703                  zfs_dirent_unlock(sdl);
3701 3704                  zfs_dirent_unlock(tdl);
3702 3705  
3703 3706                  if (sdzp == tdzp)
3704 3707                          rw_exit(&sdzp->z_name_lock);
3705 3708  
3706 3709                  VN_RELE(ZTOV(szp));
3707 3710                  if (tzp)
3708 3711                          VN_RELE(ZTOV(tzp));
3709 3712                  if (error == ERESTART) {
3710 3713                          waited = B_TRUE;
3711 3714                          dmu_tx_wait(tx);
3712 3715                          dmu_tx_abort(tx);
3713 3716                          goto top;
3714 3717                  }
3715 3718                  dmu_tx_abort(tx);
3716 3719                  ZFS_EXIT(zfsvfs);
3717 3720                  return (error);
3718 3721          }
3719 3722  
                   /*
                    * rm_err records the result of removing the existing target; it
                    * is consulted after the commit to decide whether to send
                    * vnevent_rename_dest for the replaced object.
                    */
3720 3723          if (tzp)        /* Attempt to remove the existing target */
3721 3724                  error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3722 3725  
3723 3726          if (error == 0) {
3724 3727                  error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3725 3728                  if (error == 0) {
3726 3729                          szp->z_pflags |= ZFS_AV_MODIFIED;
3727 3730  
3728 3731                          error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3729 3732                              (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3730 3733                          ASSERT0(error);
3731 3734  
3732 3735                          error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3733 3736                          if (error == 0) {
3734 3737                                  zfs_log_rename(zilog, tx, TX_RENAME |
3735 3738                                      (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3736 3739                                      sdl->dl_name, tdzp, tdl->dl_name, szp);
3737 3740  
3738 3741                                  /*
3739 3742                                   * Update path information for the target vnode
3740 3743                                   */
3741 3744                                  vn_renamepath(tdvp, ZTOV(szp), tnm,
3742 3745                                      strlen(tnm));
3743 3746                          } else {
3744 3747                                  /*
3745 3748                                   * At this point, we have successfully created
3746 3749                                   * the target name, but have failed to remove
3747 3750                                   * the source name.  Since the create was done
3748 3751                                   * with the ZRENAMING flag, there are
3749 3752                                   * complications; for one, the link count is
3750 3753                                   * wrong.  The easiest way to deal with this
3751 3754                                   * is to remove the newly created target, and
3752 3755                                   * return the original error.  This must
3753 3756                                   * succeed; fortunately, it is very unlikely to
3754 3757                                   * fail, since we just created it.
3755 3758                                   */
3756 3759                                  VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3757 3760                                      ZRENAMING, NULL), ==, 0);
3758 3761                          }
3759 3762                  }
3760 3763          }
3761 3764  
                   /*
                    * The transaction is committed on both the success and the
                    * failure paths of the link operations above.
                    */
3762 3765          dmu_tx_commit(tx);
3763 3766  
3764 3767          if (tzp && rm_err == 0)
3765 3768                  vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3766 3769  
3767 3770          if (error == 0) {
3768 3771                  vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3769 3772                  vnevent_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
3770 3773          }
           /*
            * Common cleanup.  Every "goto out" follows successful locking of both
            * directory entries, so sdl, tdl and szp are always valid here; tzp
            * and zl may be NULL.
            */
3771 3774  out:
3772 3775          if (zl != NULL)
3773 3776                  zfs_rename_unlock(&zl);
3774 3777  
3775 3778          zfs_dirent_unlock(sdl);
3776 3779          zfs_dirent_unlock(tdl);
3777 3780  
3778 3781          if (sdzp == tdzp)
3779 3782                  rw_exit(&sdzp->z_name_lock);
3780 3783  
3781 3784  
3782 3785          VN_RELE(ZTOV(szp));
3783 3786          if (tzp)
3784 3787                  VN_RELE(ZTOV(tzp));
3785 3788  
                   /*
                    * When the objset's os_sync is ZFS_SYNC_ALWAYS, call zil_commit()
                    * before returning.  This runs on every path through "out",
                    * including the error and same-object no-op paths.
                    */
3786 3789          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3787 3790                  zil_commit(zilog, 0);
3788 3791  
3789 3792          ZFS_EXIT(zfsvfs);
3790 3793          return (error);
3791 3794  }
3792 3795  
3793 3796  /*
3794 3797   * Insert the indicated symbolic reference entry into the directory.
3795 3798   *
3796 3799   *      IN:     dvp     - Directory to contain new symbolic link.
3797 3800   *              name    - Name for new symlink entry.
3798 3801   *              vap     - Attributes of new entry.
            *              link    - Path string stored as the symlink's contents.
3799 3802   *              cr      - credentials of caller.
3800 3803   *              ct      - caller context
3801 3804   *              flags   - case flags
3802 3805   *
3803 3806   *      RETURN: 0 on success, error code on failure.
            *
            * Error codes set directly by this function (errors from the helpers
            * it calls are passed through unchanged):
            *      EILSEQ       - name fails UTF-8 validation on a utf8-only
            *                     file system.
            *      ENAMETOOLONG - strlen(link) exceeds MAXPATHLEN.
            *      EDQUOT       - zfs_acl_ids_overquota() reports over quota.
3804 3807   *
3805 3808   * Timestamps:
3806 3809   *      dvp - ctime|mtime updated
3807 3810   */
3808 3811  /*ARGSUSED*/
3809 3812  static int
3810 3813  zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3811 3814      caller_context_t *ct, int flags)
3812 3815  {
3813 3816          znode_t         *zp, *dzp = VTOZ(dvp);
3814 3817          zfs_dirlock_t   *dl;
3815 3818          dmu_tx_t        *tx;
3816 3819          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
3817 3820          zilog_t         *zilog;
3818 3821          uint64_t        len = strlen(link);
3819 3822          int             error;
3820 3823          int             zflg = ZNEW;
3821 3824          zfs_acl_ids_t   acl_ids;
3822 3825          boolean_t       fuid_dirtied;
3823 3826          uint64_t        txtype = TX_SYMLINK;
3824 3827          boolean_t       waited = B_FALSE;
3825 3828  
3826 3829          ASSERT(vap->va_type == VLNK);
3827 3830  
3828 3831          ZFS_ENTER(zfsvfs);
3829 3832          ZFS_VERIFY_ZP(dzp);
3830 3833          zilog = zfsvfs->z_log;
3831 3834  
3832 3835          if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3833 3836              NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3834 3837                  ZFS_EXIT(zfsvfs);
3835 3838                  return (SET_ERROR(EILSEQ));
3836 3839          }
3837 3840          if (flags & FIGNORECASE)
3838 3841                  zflg |= ZCILOOK;
3839 3842  
3840 3843          if (len > MAXPATHLEN) {
3841 3844                  ZFS_EXIT(zfsvfs);
3842 3845                  return (SET_ERROR(ENAMETOOLONG));
3843 3846          }
3844 3847  
                   /*
                    * The ACL ids are created once, before the retry label, and are
                    * reused across ERESTART retries.  Every exit path after this
                    * point calls zfs_acl_ids_free().
                    */
3845 3848          if ((error = zfs_acl_ids_create(dzp, 0,
3846 3849              vap, cr, NULL, &acl_ids)) != 0) {
3847 3850                  ZFS_EXIT(zfsvfs);
3848 3851                  return (error);
3849 3852          }
3850 3853  top:
3851 3854          /*
3852 3855           * Attempt to lock directory; fail if entry already exists.
3853 3856           */
3854 3857          error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3855 3858          if (error) {
3856 3859                  zfs_acl_ids_free(&acl_ids);
3857 3860                  ZFS_EXIT(zfsvfs);
3858 3861                  return (error);
3859 3862          }
3860 3863  
3861 3864          if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3862 3865                  zfs_acl_ids_free(&acl_ids);
3863 3866                  zfs_dirent_unlock(dl);
3864 3867                  ZFS_EXIT(zfsvfs);
3865 3868                  return (error);
3866 3869          }
3867 3870  
3868 3871          if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3869 3872                  zfs_acl_ids_free(&acl_ids);
3870 3873                  zfs_dirent_unlock(dl);
3871 3874                  ZFS_EXIT(zfsvfs);
3872 3875                  return (SET_ERROR(EDQUOT));
3873 3876          }
                   /*
                    * Set up the transaction holds: a write to a new object sized for
                    * the link text (at least one byte), the directory ZAP entry for
                    * name, SA creation space covering the ACL, the base attributes
                    * and the link text, and the directory's SA handle.  A separate
                    * write hold is added for the ACL when SA is not in use and the
                    * ACL is larger than ZFS_ACE_SPACE.
                    */
3874 3877          tx = dmu_tx_create(zfsvfs->z_os);
3875 3878          fuid_dirtied = zfsvfs->z_fuid_dirty;
3876 3879          dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3877 3880          dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3878 3881          dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3879 3882              ZFS_SA_BASE_ATTR_SIZE + len);
3880 3883          dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3881 3884          if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3882 3885                  dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3883 3886                      acl_ids.z_aclp->z_acl_bytes);
3884 3887          }
3885 3888          if (fuid_dirtied)
3886 3889                  zfs_fuid_txhold(zfsvfs, tx);
3887 3890          error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
3888 3891          if (error) {
                           /*
                            * Assignment failed: drop the dirent lock.  On ERESTART,
                            * wait, abort this tx and retry from "top" keeping
                            * acl_ids; otherwise free acl_ids and fail.
                            */
3889 3892                  zfs_dirent_unlock(dl);
3890 3893                  if (error == ERESTART) {
3891 3894                          waited = B_TRUE;
3892 3895                          dmu_tx_wait(tx);
3893 3896                          dmu_tx_abort(tx);
3894 3897                          goto top;
3895 3898                  }
3896 3899                  zfs_acl_ids_free(&acl_ids);
3897 3900                  dmu_tx_abort(tx);
3898 3901                  ZFS_EXIT(zfsvfs);
3899 3902                  return (error);
3900 3903          }
3901 3904  
3902 3905          /*
3903 3906           * Create a new object for the symlink.
3904 3907           * for version 4 ZPL datasets the symlink will be an SA attribute
3905 3908           */
3906 3909          zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3907 3910  
3908 3911          if (fuid_dirtied)
3909 3912                  zfs_fuid_sync(zfsvfs, tx);
3910 3913  
                   /*
                    * Store the link text under z_lock: through the SA_ZPL_SYMLINK
                    * attribute for SA znodes, otherwise via zfs_sa_symlink().
                    *
                    * NOTE(review): only the sa_update() branch assigns error, and
                    * that value is what the function finally returns; the code below
                    * still links, logs and commits regardless of it.
                    */
3911 3914          mutex_enter(&zp->z_lock);
3912 3915          if (zp->z_is_sa)
3913 3916                  error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3914 3917                      link, len, tx);
3915 3918          else
3916 3919                  zfs_sa_symlink(zp, link, len, tx);
3917 3920          mutex_exit(&zp->z_lock);
3918 3921  
                   /*
                    * The symlink's size is the length of the link text.
                    */
3919 3922          zp->z_size = len;
3920 3923          (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3921 3924              &zp->z_size, sizeof (zp->z_size), tx);
3922 3925          /*
3923 3926           * Insert the new object into the directory.
                    * The return value of zfs_link_create() is ignored here.
3924 3927           */
3925 3928          (void) zfs_link_create(dl, zp, tx, ZNEW);
3926 3929  
3927 3930          if (flags & FIGNORECASE)
3928 3931                  txtype |= TX_CI;
3929 3932          zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3930 3933  
3931 3934          zfs_acl_ids_free(&acl_ids);
3932 3935  
3933 3936          dmu_tx_commit(tx);
3934 3937  
3935 3938          zfs_dirent_unlock(dl);
3936 3939  
3937 3940          VN_RELE(ZTOV(zp));
3938 3941  
                   /*
                    * When the objset's os_sync is ZFS_SYNC_ALWAYS, call zil_commit()
                    * before returning.
                    */
3939 3942          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3940 3943                  zil_commit(zilog, 0);
3941 3944  
3942 3945          ZFS_EXIT(zfsvfs);
3943 3946          return (error);
3944 3947  }
3945 3948  
3946 3949  /*
3947 3950   * Return, in the buffer contained in the provided uio structure,
3948 3951   * the symbolic path referred to by vp.
3949 3952   *
3950 3953   *      IN:     vp      - vnode of symbolic link.
3951 3954   *              uio     - structure to contain the link path.
3952 3955   *              cr      - credentials of caller.
3953 3956   *              ct      - caller context
3954 3957   *
3955 3958   *      OUT:    uio     - structure containing the link path.
3956 3959   *
3957 3960   *      RETURN: 0 on success, error code on failure.
            *              The value returned is the result of the SA lookup or
            *              of zfs_sa_readlink(); cr and ct are not referenced in
            *              the body.
3958 3961   *
3959 3962   * Timestamps:
3960 3963   *      vp - atime updated
3961 3964   */
3962 3965  /* ARGSUSED */
3963 3966  static int
3964 3967  zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3965 3968  {
3966 3969          znode_t         *zp = VTOZ(vp);
3967 3970          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
3968 3971          int             error;
3969 3972  
3970 3973          ZFS_ENTER(zfsvfs);
3971 3974          ZFS_VERIFY_ZP(zp);
3972 3975  
                   /*
                    * Read the link text under z_lock: from the SA_ZPL_SYMLINK
                    * attribute for SA znodes, otherwise via zfs_sa_readlink().
                    */
3973 3976          mutex_enter(&zp->z_lock);
3974 3977          if (zp->z_is_sa)
3975 3978                  error = sa_lookup_uio(zp->z_sa_hdl,
3976 3979                      SA_ZPL_SYMLINK(zfsvfs), uio);
3977 3980          else
3978 3981                  error = zfs_sa_readlink(zp, uio);
3979 3982          mutex_exit(&zp->z_lock);
3980 3983  
                   /*
                    * The access-time stamp is applied unconditionally, i.e. even
                    * when the read above returned an error.
                    */
3981 3984          ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3982 3985  
3983 3986          ZFS_EXIT(zfsvfs);
3984 3987          return (error);
3985 3988  }
3986 3989  
3987 3990  /*
3988 3991   * Insert a new entry into directory tdvp referencing svp.
3989 3992   *
3990 3993   *      IN:     tdvp    - Directory to contain new entry.
3991 3994   *              svp     - vnode of new entry.
3992 3995   *              name    - name of new entry.
3993 3996   *              cr      - credentials of caller.
3994 3997   *              ct      - caller context
            *              flags   - case flags
3995 3998   *
3996 3999   *      RETURN: 0 on success, error code on failure.
            *
            * Error codes set directly by this function (errors from the helpers
            * it calls are passed through unchanged):
            *      EPERM   - svp is a directory, svp's parent is the shares
            *                directory, or the caller is not the owner and
            *                secpolicy_basic_link() refuses.
            *      EXDEV   - svp belongs to a different zfsvfs or is a ctldir node.
            *      EILSEQ  - name fails UTF-8 validation on a utf8-only file system.
            *      EINVAL  - svp and tdvp disagree on ZFS_XATTR.
3997 4000   *
3998 4001   * Timestamps:
3999 4002   *      tdvp - ctime|mtime updated
4000 4003   *       svp - ctime updated
4001 4004   */
4002 4005  /* ARGSUSED */
4003 4006  static int
4004 4007  zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4005 4008      caller_context_t *ct, int flags)
4006 4009  {
4007 4010          znode_t         *dzp = VTOZ(tdvp);
4008 4011          znode_t         *tzp, *szp;
4009 4012          zfsvfs_t        *zfsvfs = dzp->z_zfsvfs;
4010 4013          zilog_t         *zilog;
4011 4014          zfs_dirlock_t   *dl;
4012 4015          dmu_tx_t        *tx;
4013 4016          vnode_t         *realvp;
4014 4017          int             error;
4015 4018          int             zf = ZNEW;
4016 4019          uint64_t        parent;
4017 4020          uid_t           owner;
4018 4021          boolean_t       waited = B_FALSE;
4019 4022  
4020 4023          ASSERT(tdvp->v_type == VDIR);
4021 4024  
4022 4025          ZFS_ENTER(zfsvfs);
4023 4026          ZFS_VERIFY_ZP(dzp);
4024 4027          zilog = zfsvfs->z_log;
4025 4028  
                   /*
                    * Use the underlying real vnode for the source if one is reported.
                    */
4026 4029          if (VOP_REALVP(svp, &realvp, ct) == 0)
4027 4030                  svp = realvp;
4028 4031  
4029 4032          /*
4030 4033           * POSIX dictates that we return EPERM here.
4031 4034           * Better choices include ENOTSUP or EISDIR.
4032 4035           */
4033 4036          if (svp->v_type == VDIR) {
4034 4037                  ZFS_EXIT(zfsvfs);
4035 4038                  return (SET_ERROR(EPERM));
4036 4039          }
4037 4040  
4038 4041          szp = VTOZ(svp);
4039 4042          ZFS_VERIFY_ZP(szp);
4040 4043  
4041 4044          /*
4042 4045           * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
4043 4046           * ctldir appear to have the same v_vfsp.
4044 4047           */
4045 4048          if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
4046 4049                  ZFS_EXIT(zfsvfs);
4047 4050                  return (SET_ERROR(EXDEV));
4048 4051          }
4049 4052  
4050 4053          /* Prevent links to .zfs/shares files */
4051 4054  
4052 4055          if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4053 4056              &parent, sizeof (uint64_t))) != 0) {
4054 4057                  ZFS_EXIT(zfsvfs);
4055 4058                  return (error);
4056 4059          }
4057 4060          if (parent == zfsvfs->z_shares_dir) {
4058 4061                  ZFS_EXIT(zfsvfs);
4059 4062                  return (SET_ERROR(EPERM));
4060 4063          }
4061 4064  
4062 4065          if (zfsvfs->z_utf8 && u8_validate(name,
4063 4066              strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4064 4067                  ZFS_EXIT(zfsvfs);
4065 4068                  return (SET_ERROR(EILSEQ));
4066 4069          }
4067 4070          if (flags & FIGNORECASE)
4068 4071                  zf |= ZCILOOK;
4069 4072  
4070 4073          /*
4071 4074           * We do not support links between attributes and non-attributes
4072 4075           * because of the potential security risk of creating links
4073 4076           * into "normal" file space in order to circumvent restrictions
4074 4077           * imposed in attribute space.
4075 4078           */
4076 4079          if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4077 4080                  ZFS_EXIT(zfsvfs);
4078 4081                  return (SET_ERROR(EINVAL));
4079 4082          }
4080 4083  
4081 4084  
                   /*
                    * Only the file's owner may link to it, unless
                    * secpolicy_basic_link() allows the caller to do so anyway.
                    */
4082 4085          owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4083 4086          if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
4084 4087                  ZFS_EXIT(zfsvfs);
4085 4088                  return (SET_ERROR(EPERM));
4086 4089          }
4087 4090  
4088 4091          if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4089 4092                  ZFS_EXIT(zfsvfs);
4090 4093                  return (error);
4091 4094          }
4092 4095  
           /*
            * Retry point.  All validation and permission checks above run once;
            * only the directory-entry lock and the transaction setup are redone
            * when dmu_tx_assign() returns ERESTART.
            */
4093 4096  top:
4094 4097          /*
4095 4098           * Attempt to lock directory; fail if entry already exists.
                    * tzp receives the znode output of zfs_dirent_lock() but is not
                    * otherwise used in this function.
4096 4099           */
4097 4100          error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4098 4101          if (error) {
4099 4102                  ZFS_EXIT(zfsvfs);
4100 4103                  return (error);
4101 4104          }
4102 4105  
                   /*
                    * Set up the transaction holds: the source znode's SA handle and
                    * the directory ZAP entry for the new name, plus the SA upgrade
                    * holds for both the source and the directory.
                    */
4103 4106          tx = dmu_tx_create(zfsvfs->z_os);
4104 4107          dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4105 4108          dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4106 4109          zfs_sa_upgrade_txholds(tx, szp);
4107 4110          zfs_sa_upgrade_txholds(tx, dzp);
4108 4111          error = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
4109 4112          if (error) {
4110 4113                  zfs_dirent_unlock(dl);
4111 4114                  if (error == ERESTART) {
4112 4115                          waited = B_TRUE;
4113 4116                          dmu_tx_wait(tx);
4114 4117                          dmu_tx_abort(tx);
4115 4118                          goto top;
4116 4119                  }
4117 4120                  dmu_tx_abort(tx);
4118 4121                  ZFS_EXIT(zfsvfs);
4119 4122                  return (error);
4120 4123          }
4121 4124  
4122 4125          error = zfs_link_create(dl, szp, tx, 0);
4123 4126  
                   /*
                    * The log record is written only when the link was created; the
                    * transaction itself is committed in either case.
                    */
4124 4127          if (error == 0) {
4125 4128                  uint64_t txtype = TX_LINK;
4126 4129                  if (flags & FIGNORECASE)
4127 4130                          txtype |= TX_CI;
4128 4131                  zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4129 4132          }
4130 4133  
4131 4134          dmu_tx_commit(tx);
4132 4135  
4133 4136          zfs_dirent_unlock(dl);
4134 4137  
4135 4138          if (error == 0) {
4136 4139                  vnevent_link(svp, ct);
4137 4140          }
4138 4141  
                   /*
                    * When the objset's os_sync is ZFS_SYNC_ALWAYS, call zil_commit()
                    * before returning, whether or not the link was created.
                    */
4139 4142          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4140 4143                  zil_commit(zilog, 0);
4141 4144  
4142 4145          ZFS_EXIT(zfsvfs);
4143 4146          return (error);
4144 4147  }
4145 4148  
4146 4149  /*
4147 4150   * zfs_null_putapage() is used when the file system has been force
4148 4151   * unmounted. It just drops the pages.
            *
            *      IN:     pp      - page (list) handed to pvn_write_done().
            *              vp, offp, lenp, flags, cr - not referenced.
            *
            *      RETURN: always 0.
4149 4152   */
4150 4153  /* ARGSUSED */
4151 4154  static int
4152 4155  zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4153 4156      size_t *lenp, int flags, cred_t *cr)
4154 4157  {
                   /*
                    * Nothing is written; the pages are completed with
                    * B_INVAL|B_FORCE|B_ERROR.
                    */
4155 4158          pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4156 4159          return (0);
4157 4160  }
4158 4161  
4159 4162  /*
4160 4163   * Push a page out to disk, klustering if possible.
4161 4164   *
4162 4165   *      IN:     vp      - file to push page to.
4163 4166   *              pp      - page to push.
4164 4167   *              flags   - additional flags.
4165 4168   *              cr      - credentials of caller.
4166 4169   *
4167 4170   *      OUT:    offp    - start of range pushed.
4168 4171   *              lenp    - len of range pushed.
4169 4172   *
4170 4173   *      RETURN: 0 on success, error code on failure.
4171 4174   *
4172 4175   * NOTE: callers must have locked the page to be pushed.  On
4173 4176   * exit, the page (and all other pages in the kluster) must be
4174 4177   * unlocked.
4175 4178   */
4176 4179  /* ARGSUSED */
4177 4180  static int
4178 4181  zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4179 4182      size_t *lenp, int flags, cred_t *cr)
4180 4183  {
4181 4184          znode_t         *zp = VTOZ(vp);
4182 4185          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4183 4186          dmu_tx_t        *tx;
4184 4187          u_offset_t      off, koff;
4185 4188          size_t          len, klen;
4186 4189          int             err;
4187 4190  
4188 4191          off = pp->p_offset;
4189 4192          len = PAGESIZE;
4190 4193          /*
4191 4194           * If our blocksize is bigger than the page size, try to kluster
4192 4195           * multiple pages so that we write a full block (thus avoiding
4193 4196           * a read-modify-write).
4194 4197           */
4195 4198          if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4196 4199                  klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4197 4200                  koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4198 4201                  ASSERT(koff <= zp->z_size);
4199 4202                  if (koff + klen > zp->z_size)
4200 4203                          klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4201 4204                  pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4202 4205          }
4203 4206          ASSERT3U(btop(len), ==, btopr(len));
4204 4207  
4205 4208          /*
4206 4209           * Can't push pages past end-of-file.
4207 4210           */
4208 4211          if (off >= zp->z_size) {
4209 4212                  /* ignore all pages */
4210 4213                  err = 0;
4211 4214                  goto out;
4212 4215          } else if (off + len > zp->z_size) {
4213 4216                  int npages = btopr(zp->z_size - off);
4214 4217                  page_t *trunc;
4215 4218  
4216 4219                  page_list_break(&pp, &trunc, npages);
4217 4220                  /* ignore pages past end of file */
4218 4221                  if (trunc)
4219 4222                          pvn_write_done(trunc, flags);
4220 4223                  len = zp->z_size - off;
4221 4224          }
4222 4225  
4223 4226          if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4224 4227              zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4225 4228                  err = SET_ERROR(EDQUOT);
4226 4229                  goto out;
4227 4230          }
4228 4231          tx = dmu_tx_create(zfsvfs->z_os);
4229 4232          dmu_tx_hold_write(tx, zp->z_id, off, len);
4230 4233  
4231 4234          dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4232 4235          zfs_sa_upgrade_txholds(tx, zp);
4233 4236          err = dmu_tx_assign(tx, TXG_WAIT);
4234 4237          if (err != 0) {
4235 4238                  dmu_tx_abort(tx);
4236 4239                  goto out;
4237 4240          }
4238 4241  
4239 4242          if (zp->z_blksz <= PAGESIZE) {
4240 4243                  caddr_t va = zfs_map_page(pp, S_READ);
4241 4244                  ASSERT3U(len, <=, PAGESIZE);
4242 4245                  dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4243 4246                  zfs_unmap_page(pp, va);
4244 4247          } else {
4245 4248                  err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4246 4249          }
4247 4250  
4248 4251          if (err == 0) {
4249 4252                  uint64_t mtime[2], ctime[2];
4250 4253                  sa_bulk_attr_t bulk[3];

↓ open down ↓

3246 lines elided

↑ open up ↑

4251 4254                  int count = 0;
4252 4255  
4253 4256                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4254 4257                      &mtime, 16);
4255 4258                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4256 4259                      &ctime, 16);
4257 4260                  SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4258 4261                      &zp->z_pflags, 8);
4259 4262                  zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4260 4263                      B_TRUE);
4261      -                err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4262      -
4263 4264                  zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4264 4265          }
4265 4266          dmu_tx_commit(tx);
4266 4267  
4267 4268  out:
4268 4269          pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4269 4270          if (offp)
4270 4271                  *offp = off;
4271 4272          if (lenp)
4272 4273                  *lenp = len;

4273 4274  
4274 4275          return (err);
4275 4276  }
4276 4277  
4277 4278  /*
4278 4279   * Copy the portion of the file indicated from pages into the file.
4279 4280   * The pages are stored in a page list attached to the file's vnode.
4280 4281   *
4281 4282   *      IN:     vp      - vnode of file to push page data to.
4282 4283   *              off     - position in file to put data.
4283 4284   *              len     - amount of data to write.
4284 4285   *              flags   - flags to control the operation.
4285 4286   *              cr      - credentials of caller.
4286 4287   *              ct      - caller context.
4287 4288   *
4288 4289   *      RETURN: 0 on success, error code on failure.
4289 4290   *
4290 4291   * Timestamps:
4291 4292   *      vp - ctime|mtime updated
4292 4293   */
4293 4294  /*ARGSUSED*/
4294 4295  static int
4295 4296  zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4296 4297      caller_context_t *ct)
4297 4298  {
4298 4299          znode_t         *zp = VTOZ(vp);
4299 4300          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4300 4301          page_t          *pp;
4301 4302          size_t          io_len;
4302 4303          u_offset_t      io_off;
4303 4304          uint_t          blksz;
4304 4305          rl_t            *rl;
4305 4306          int             error = 0;
4306 4307  
4307 4308          ZFS_ENTER(zfsvfs);
4308 4309          ZFS_VERIFY_ZP(zp);
4309 4310  
4310 4311          /*
4311 4312           * There's nothing to do if no data is cached.
4312 4313           */
4313 4314          if (!vn_has_cached_data(vp)) {
4314 4315                  ZFS_EXIT(zfsvfs);
4315 4316                  return (0);
4316 4317          }
4317 4318  
4318 4319          /*
4319 4320           * Align this request to the file block size in case we kluster.
4320 4321           * XXX - this can result in pretty aggressive locking, which can
4321 4322           * impact simultaneous read/write access.  One option might be
4322 4323           * to break up long requests (len == 0) into block-by-block
4323 4324           * operations to get narrower locking.
4324 4325           */
4325 4326          blksz = zp->z_blksz;
4326 4327          if (ISP2(blksz))
4327 4328                  io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4328 4329          else
4329 4330                  io_off = 0;
4330 4331          if (len > 0 && ISP2(blksz))
4331 4332                  io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4332 4333          else
4333 4334                  io_len = 0;
4334 4335  
4335 4336          if (io_len == 0) {
4336 4337                  /*
4337 4338                   * Search the entire vp list for pages >= io_off.
4338 4339                   */
4339 4340                  rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
4340 4341                  error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4341 4342                  goto out;
4342 4343          }
4343 4344          rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
4344 4345  
4345 4346          if (off > zp->z_size) {
4346 4347                  /* past end of file */
4347 4348                  zfs_range_unlock(rl);
4348 4349                  ZFS_EXIT(zfsvfs);
4349 4350                  return (0);
4350 4351          }
4351 4352  
4352 4353          len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4353 4354  
4354 4355          for (off = io_off; io_off < off + len; io_off += io_len) {
4355 4356                  if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4356 4357                          pp = page_lookup(vp, io_off,
4357 4358                              (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4358 4359                  } else {
4359 4360                          pp = page_lookup_nowait(vp, io_off,
4360 4361                              (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4361 4362                  }
4362 4363  
4363 4364                  if (pp != NULL && pvn_getdirty(pp, flags)) {
4364 4365                          int err;
4365 4366  
4366 4367                          /*
4367 4368                           * Found a dirty page to push
4368 4369                           */
4369 4370                          err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4370 4371                          if (err)
4371 4372                                  error = err;
4372 4373                  } else {
4373 4374                          io_len = PAGESIZE;
4374 4375                  }
4375 4376          }
4376 4377  out:
4377 4378          zfs_range_unlock(rl);
4378 4379          if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4379 4380                  zil_commit(zfsvfs->z_log, zp->z_id);
4380 4381          ZFS_EXIT(zfsvfs);
4381 4382          return (error);
4382 4383  }
4383 4384  
4384 4385  /*ARGSUSED*/
4385 4386  void
4386 4387  zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4387 4388  {
4388 4389          znode_t *zp = VTOZ(vp);
4389 4390          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4390 4391          int error;
4391 4392  
4392 4393          rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4393 4394          if (zp->z_sa_hdl == NULL) {
4394 4395                  /*
4395 4396                   * The fs has been unmounted, or we did a
4396 4397                   * suspend/resume and this file no longer exists.
4397 4398                   */
4398 4399                  if (vn_has_cached_data(vp)) {
4399 4400                          (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4400 4401                              B_INVAL, cr);
4401 4402                  }
4402 4403  
4403 4404                  mutex_enter(&zp->z_lock);
4404 4405                  mutex_enter(&vp->v_lock);
4405 4406                  ASSERT(vp->v_count == 1);
4406 4407                  vp->v_count = 0;
4407 4408                  mutex_exit(&vp->v_lock);
4408 4409                  mutex_exit(&zp->z_lock);
4409 4410                  rw_exit(&zfsvfs->z_teardown_inactive_lock);
4410 4411                  zfs_znode_free(zp);
4411 4412                  return;
4412 4413          }
4413 4414  
4414 4415          /*
4415 4416           * Attempt to push any data in the page cache.  If this fails
4416 4417           * we will get kicked out later in zfs_zinactive().
4417 4418           */
4418 4419          if (vn_has_cached_data(vp)) {
4419 4420                  (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4420 4421                      cr);
4421 4422          }
4422 4423  
4423 4424          if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4424 4425                  dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4425 4426  
4426 4427                  dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4427 4428                  zfs_sa_upgrade_txholds(tx, zp);
4428 4429                  error = dmu_tx_assign(tx, TXG_WAIT);
4429 4430                  if (error) {
4430 4431                          dmu_tx_abort(tx);
4431 4432                  } else {
4432 4433                          mutex_enter(&zp->z_lock);
4433 4434                          (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4434 4435                              (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4435 4436                          zp->z_atime_dirty = 0;
4436 4437                          mutex_exit(&zp->z_lock);
4437 4438                          dmu_tx_commit(tx);
4438 4439                  }
4439 4440          }
4440 4441  
4441 4442          zfs_zinactive(zp);
4442 4443          rw_exit(&zfsvfs->z_teardown_inactive_lock);
4443 4444  }
4444 4445  
4445 4446  /*
4446 4447   * Bounds-check the seek operation.
4447 4448   *
4448 4449   *      IN:     vp      - vnode seeking within
4449 4450   *              ooff    - old file offset
4450 4451   *              noffp   - pointer to new file offset
4451 4452   *              ct      - caller context
4452 4453   *
4453 4454   *      RETURN: 0 on success, EINVAL if new offset invalid.
4454 4455   */
4455 4456  /* ARGSUSED */
4456 4457  static int
4457 4458  zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4458 4459      caller_context_t *ct)
4459 4460  {
4460 4461          if (vp->v_type == VDIR)
4461 4462                  return (0);
4462 4463          return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4463 4464  }
4464 4465  
4465 4466  /*
4466 4467   * Pre-filter the generic locking function to trap attempts to place
4467 4468   * a mandatory lock on a memory mapped file.
4468 4469   */
4469 4470  static int
4470 4471  zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4471 4472      flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4472 4473  {
4473 4474          znode_t *zp = VTOZ(vp);
4474 4475          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4475 4476  
4476 4477          ZFS_ENTER(zfsvfs);
4477 4478          ZFS_VERIFY_ZP(zp);
4478 4479  
4479 4480          /*
4480 4481           * We are following the UFS semantics with respect to mapcnt
4481 4482           * here: If we see that the file is mapped already, then we will
4482 4483           * return an error, but we don't worry about races between this
4483 4484           * function and zfs_map().
4484 4485           */
4485 4486          if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4486 4487                  ZFS_EXIT(zfsvfs);
4487 4488                  return (SET_ERROR(EAGAIN));
4488 4489          }
4489 4490          ZFS_EXIT(zfsvfs);
4490 4491          return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4491 4492  }
4492 4493  
4493 4494  /*
4494 4495   * If we can't find a page in the cache, we will create a new page
4495 4496   * and fill it with file data.  For efficiency, we may try to fill
4496 4497   * multiple pages at once (klustering) to fill up the supplied page
4497 4498   * list.  Note that the pages to be filled are held with an exclusive
4498 4499   * lock to prevent access by other threads while they are being filled.
4499 4500   */
4500 4501  static int
4501 4502  zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4502 4503      caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4503 4504  {
4504 4505          znode_t *zp = VTOZ(vp);
4505 4506          page_t *pp, *cur_pp;
4506 4507          objset_t *os = zp->z_zfsvfs->z_os;
4507 4508          u_offset_t io_off, total;
4508 4509          size_t io_len;
4509 4510          int err;
4510 4511  
4511 4512          if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4512 4513                  /*
4513 4514                   * We only have a single page, don't bother klustering
4514 4515                   */
4515 4516                  io_off = off;
4516 4517                  io_len = PAGESIZE;
4517 4518                  pp = page_create_va(vp, io_off, io_len,
4518 4519                      PG_EXCL | PG_WAIT, seg, addr);
4519 4520          } else {
4520 4521                  /*
4521 4522                   * Try to find enough pages to fill the page list
4522 4523                   */
4523 4524                  pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4524 4525                      &io_len, off, plsz, 0);
4525 4526          }
4526 4527          if (pp == NULL) {
4527 4528                  /*
4528 4529                   * The page already exists, nothing to do here.
4529 4530                   */
4530 4531                  *pl = NULL;
4531 4532                  return (0);
4532 4533          }
4533 4534  
4534 4535          /*
4535 4536           * Fill the pages in the kluster.
4536 4537           */
4537 4538          cur_pp = pp;
4538 4539          for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4539 4540                  caddr_t va;
4540 4541  
4541 4542                  ASSERT3U(io_off, ==, cur_pp->p_offset);
4542 4543                  va = zfs_map_page(cur_pp, S_WRITE);
4543 4544                  err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4544 4545                      DMU_READ_PREFETCH);
4545 4546                  zfs_unmap_page(cur_pp, va);
4546 4547                  if (err) {
4547 4548                          /* On error, toss the entire kluster */
4548 4549                          pvn_read_done(pp, B_ERROR);
4549 4550                          /* convert checksum errors into IO errors */
4550 4551                          if (err == ECKSUM)
4551 4552                                  err = SET_ERROR(EIO);
4552 4553                          return (err);
4553 4554                  }
4554 4555                  cur_pp = cur_pp->p_next;
4555 4556          }
4556 4557  
4557 4558          /*
4558 4559           * Fill in the page list array from the kluster starting
4559 4560           * from the desired offset `off'.
4560 4561           * NOTE: the page list will always be null terminated.
4561 4562           */
4562 4563          pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4563 4564          ASSERT(pl == NULL || (*pl)->p_offset == off);
4564 4565  
4565 4566          return (0);
4566 4567  }
4567 4568  
4568 4569  /*
4569 4570   * Return pointers to the pages for the file region [off, off + len]
4570 4571   * in the pl array.  If plsz is greater than len, this function may
4571 4572   * also return page pointers from after the specified region
4572 4573   * (i.e. the region [off, off + plsz]).  These additional pages are
4573 4574   * only returned if they are already in the cache, or were created as
4574 4575   * part of a klustered read.
4575 4576   *
4576 4577   *      IN:     vp      - vnode of file to get data from.
4577 4578   *              off     - position in file to get data from.
4578 4579   *              len     - amount of data to retrieve.
4579 4580   *              plsz    - length of provided page list.
4580 4581   *              seg     - segment to obtain pages for.
4581 4582   *              addr    - virtual address of fault.
4582 4583   *              rw      - mode of created pages.
4583 4584   *              cr      - credentials of caller.
4584 4585   *              ct      - caller context.
4585 4586   *
4586 4587   *      OUT:    protp   - protection mode of created pages.
4587 4588   *              pl      - list of pages created.
4588 4589   *
4589 4590   *      RETURN: 0 on success, error code on failure.
4590 4591   *
4591 4592   * Timestamps:
4592 4593   *      vp - atime updated
4593 4594   */
4594 4595  /* ARGSUSED */
4595 4596  static int
4596 4597  zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4597 4598      page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4598 4599      enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4599 4600  {
4600 4601          znode_t         *zp = VTOZ(vp);
4601 4602          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4602 4603          page_t          **pl0 = pl;
4603 4604          int             err = 0;
4604 4605  
4605 4606          /* we do our own caching, faultahead is unnecessary */
4606 4607          if (pl == NULL)
4607 4608                  return (0);
4608 4609          else if (len > plsz)
4609 4610                  len = plsz;
4610 4611          else
4611 4612                  len = P2ROUNDUP(len, PAGESIZE);
4612 4613          ASSERT(plsz >= len);
4613 4614  
4614 4615          ZFS_ENTER(zfsvfs);
4615 4616          ZFS_VERIFY_ZP(zp);
4616 4617  
4617 4618          if (protp)
4618 4619                  *protp = PROT_ALL;
4619 4620  
4620 4621          /*
4621 4622           * Loop through the requested range [off, off + len) looking
4622 4623           * for pages.  If we don't find a page, we will need to create
4623 4624           * a new page and fill it with data from the file.
4624 4625           */
4625 4626          while (len > 0) {
4626 4627                  if (*pl = page_lookup(vp, off, SE_SHARED))
4627 4628                          *(pl+1) = NULL;
4628 4629                  else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4629 4630                          goto out;
4630 4631                  while (*pl) {
4631 4632                          ASSERT3U((*pl)->p_offset, ==, off);
4632 4633                          off += PAGESIZE;
4633 4634                          addr += PAGESIZE;
4634 4635                          if (len > 0) {
4635 4636                                  ASSERT3U(len, >=, PAGESIZE);
4636 4637                                  len -= PAGESIZE;
4637 4638                          }
4638 4639                          ASSERT3U(plsz, >=, PAGESIZE);
4639 4640                          plsz -= PAGESIZE;
4640 4641                          pl++;
4641 4642                  }
4642 4643          }
4643 4644  
4644 4645          /*
4645 4646           * Fill out the page array with any pages already in the cache.
4646 4647           */
4647 4648          while (plsz > 0 &&
4648 4649              (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4649 4650                          off += PAGESIZE;
4650 4651                          plsz -= PAGESIZE;
4651 4652          }
4652 4653  out:
4653 4654          if (err) {
4654 4655                  /*
4655 4656                   * Release any pages we have previously locked.
4656 4657                   */
4657 4658                  while (pl > pl0)
4658 4659                          page_unlock(*--pl);
4659 4660          } else {
4660 4661                  ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4661 4662          }
4662 4663  
4663 4664          *pl = NULL;
4664 4665  
4665 4666          ZFS_EXIT(zfsvfs);
4666 4667          return (err);
4667 4668  }
4668 4669  
4669 4670  /*
4670 4671   * Request a memory map for a section of a file.  This code interacts
4671 4672   * with common code and the VM system as follows:
4672 4673   *
4673 4674   * - common code calls mmap(), which ends up in smmap_common()
4674 4675   * - this calls VOP_MAP(), which takes you into (say) zfs
4675 4676   * - zfs_map() calls as_map(), passing segvn_create() as the callback
4676 4677   * - segvn_create() creates the new segment and calls VOP_ADDMAP()
4677 4678   * - zfs_addmap() updates z_mapcnt
4678 4679   */
4679 4680  /*ARGSUSED*/
4680 4681  static int
4681 4682  zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4682 4683      size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4683 4684      caller_context_t *ct)
4684 4685  {
4685 4686          znode_t *zp = VTOZ(vp);
4686 4687          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4687 4688          segvn_crargs_t  vn_a;
4688 4689          int             error;
4689 4690  
4690 4691          ZFS_ENTER(zfsvfs);
4691 4692          ZFS_VERIFY_ZP(zp);
4692 4693  
4693 4694          if ((prot & PROT_WRITE) && (zp->z_pflags &
4694 4695              (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) {
4695 4696                  ZFS_EXIT(zfsvfs);
4696 4697                  return (SET_ERROR(EPERM));
4697 4698          }
4698 4699  
4699 4700          if ((prot & (PROT_READ | PROT_EXEC)) &&
4700 4701              (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4701 4702                  ZFS_EXIT(zfsvfs);
4702 4703                  return (SET_ERROR(EACCES));
4703 4704          }
4704 4705  
4705 4706          if (vp->v_flag & VNOMAP) {
4706 4707                  ZFS_EXIT(zfsvfs);
4707 4708                  return (SET_ERROR(ENOSYS));
4708 4709          }
4709 4710  
4710 4711          if (off < 0 || len > MAXOFFSET_T - off) {
4711 4712                  ZFS_EXIT(zfsvfs);
4712 4713                  return (SET_ERROR(ENXIO));
4713 4714          }
4714 4715  
4715 4716          if (vp->v_type != VREG) {
4716 4717                  ZFS_EXIT(zfsvfs);
4717 4718                  return (SET_ERROR(ENODEV));
4718 4719          }
4719 4720  
4720 4721          /*
4721 4722           * If file is locked, disallow mapping.
4722 4723           */
4723 4724          if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4724 4725                  ZFS_EXIT(zfsvfs);
4725 4726                  return (SET_ERROR(EAGAIN));
4726 4727          }
4727 4728  
4728 4729          as_rangelock(as);
4729 4730          error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4730 4731          if (error != 0) {
4731 4732                  as_rangeunlock(as);
4732 4733                  ZFS_EXIT(zfsvfs);
4733 4734                  return (error);
4734 4735          }
4735 4736  
4736 4737          vn_a.vp = vp;
4737 4738          vn_a.offset = (u_offset_t)off;
4738 4739          vn_a.type = flags & MAP_TYPE;
4739 4740          vn_a.prot = prot;
4740 4741          vn_a.maxprot = maxprot;
4741 4742          vn_a.cred = cr;
4742 4743          vn_a.amp = NULL;
4743 4744          vn_a.flags = flags & ~MAP_TYPE;
4744 4745          vn_a.szc = 0;
4745 4746          vn_a.lgrp_mem_policy_flags = 0;
4746 4747  
4747 4748          error = as_map(as, *addrp, len, segvn_create, &vn_a);
4748 4749  
4749 4750          as_rangeunlock(as);
4750 4751          ZFS_EXIT(zfsvfs);
4751 4752          return (error);
4752 4753  }
4753 4754  
4754 4755  /* ARGSUSED */
4755 4756  static int
4756 4757  zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4757 4758      size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4758 4759      caller_context_t *ct)
4759 4760  {
4760 4761          uint64_t pages = btopr(len);
4761 4762  
4762 4763          atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4763 4764          return (0);
4764 4765  }
4765 4766  
4766 4767  /*
4767 4768   * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4768 4769   * more accurate mtime for the associated file.  Since we don't have a way of
4769 4770   * detecting when the data was actually modified, we have to resort to
4770 4771   * heuristics.  If an explicit msync() is done, then we mark the mtime when the
4771 4772   * last page is pushed.  The problem occurs when the msync() call is omitted,
4772 4773   * which is by far the most common case:
4773 4774   *
4774 4775   *      open()
4775 4776   *      mmap()
4776 4777   *      <modify memory>
4777 4778   *      munmap()
4778 4779   *      close()
4779 4780   *      <time lapse>
4780 4781   *      putpage() via fsflush
4781 4782   *
4782 4783   * If we wait for fsflush to come along, we can have a modification time that
4783 4784   * is some arbitrary point in the future.  In order to prevent this in the
4784 4785   * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4785 4786   * torn down.
4786 4787   */
4787 4788  /* ARGSUSED */

↓ open down ↓

515 lines elided

↑ open up ↑

4788 4789  static int
4789 4790  zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4790 4791      size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4791 4792      caller_context_t *ct)
4792 4793  {
4793 4794          uint64_t pages = btopr(len);
4794 4795  
4795 4796          ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4796 4797          atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4797 4798  
     4799 +        if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
     4800 +            vn_has_cached_data(vp))
     4801 +                (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
     4802 +
4798 4803          return (0);
4799 4804  }
4800 4805  
4801 4806  /*
4802 4807   * Free or allocate space in a file.  Currently, this function only
4803 4808   * supports the `F_FREESP' command.  However, this command is somewhat
4804 4809   * misnamed, as its functionality includes the ability to allocate as
4805 4810   * well as free space.
4806 4811   *
4807 4812   *      IN:     vp      - vnode of file to free data in.

4808 4813   *              cmd     - action to take (only F_FREESP supported).
4809 4814   *              bfp     - section of file to free/alloc.
4810 4815   *              flag    - current file open mode flags.
4811 4816   *              offset  - current file offset.
4812 4817   *              cr      - credentials of caller [UNUSED].
4813 4818   *              ct      - caller context.
4814 4819   *
4815 4820   *      RETURN: 0 on success, error code on failure.
4816 4821   *
4817 4822   * Timestamps:
4818 4823   *      vp - ctime|mtime updated
4819 4824   */
4820 4825  /* ARGSUSED */
4821 4826  static int
4822 4827  zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4823 4828      offset_t offset, cred_t *cr, caller_context_t *ct)
4824 4829  {
4825 4830          znode_t         *zp = VTOZ(vp);
4826 4831          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4827 4832          uint64_t        off, len;
4828 4833          int             error;
4829 4834  
4830 4835          ZFS_ENTER(zfsvfs);
4831 4836          ZFS_VERIFY_ZP(zp);
4832 4837  
4833 4838          if (cmd != F_FREESP) {
4834 4839                  ZFS_EXIT(zfsvfs);
4835 4840                  return (SET_ERROR(EINVAL));
4836 4841          }
4837 4842  
4838 4843          /*
4839 4844           * In a case vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots) our
4840 4845           * callers might not be able to detect properly that we are read-only,
4841 4846           * so check it explicitly here.
4842 4847           */
4843 4848          if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
4844 4849                  ZFS_EXIT(zfsvfs);
4845 4850                  return (SET_ERROR(EROFS));
4846 4851          }
4847 4852  
4848 4853          if (error = convoff(vp, bfp, 0, offset)) {
4849 4854                  ZFS_EXIT(zfsvfs);
4850 4855                  return (error);
4851 4856          }
4852 4857  
4853 4858          if (bfp->l_len < 0) {
4854 4859                  ZFS_EXIT(zfsvfs);
4855 4860                  return (SET_ERROR(EINVAL));
4856 4861          }
4857 4862  
4858 4863          off = bfp->l_start;
4859 4864          len = bfp->l_len; /* 0 means from off to end of file */
4860 4865  
4861 4866          error = zfs_freesp(zp, off, len, flag, TRUE);
4862 4867  
4863 4868          if (error == 0 && len == 0) {
4864 4869                  if (off == 0) {
4865 4870                          vnevent_truncate(ZTOV(zp), ct);
4866 4871                  } else {
4867 4872                          vnevent_resize(ZTOV(zp), ct);
4868 4873                  }
4869 4874          }
4870 4875  
4871 4876          ZFS_EXIT(zfsvfs);
4872 4877          return (error);
4873 4878  }
4874 4879  
4875 4880  /*ARGSUSED*/
4876 4881  static int
4877 4882  zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4878 4883  {
4879 4884          znode_t         *zp = VTOZ(vp);
4880 4885          zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
4881 4886          uint32_t        gen;
4882 4887          uint64_t        gen64;
4883 4888          uint64_t        object = zp->z_id;
4884 4889          zfid_short_t    *zfid;
4885 4890          int             size, i, error;
4886 4891  
4887 4892          ZFS_ENTER(zfsvfs);
4888 4893          ZFS_VERIFY_ZP(zp);
4889 4894  
4890 4895          if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4891 4896              &gen64, sizeof (uint64_t))) != 0) {
4892 4897                  ZFS_EXIT(zfsvfs);
4893 4898                  return (error);
4894 4899          }
4895 4900  
4896 4901          gen = (uint32_t)gen64;
4897 4902  
4898 4903          size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4899 4904          if (fidp->fid_len < size) {
4900 4905                  fidp->fid_len = size;
4901 4906                  ZFS_EXIT(zfsvfs);
4902 4907                  return (SET_ERROR(ENOSPC));
4903 4908          }
4904 4909  
4905 4910          zfid = (zfid_short_t *)fidp;
4906 4911  
4907 4912          zfid->zf_len = size;
4908 4913  
4909 4914          for (i = 0; i < sizeof (zfid->zf_object); i++)
4910 4915                  zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4911 4916  
4912 4917          /* Must have a non-zero generation number to distinguish from .zfs */
4913 4918          if (gen == 0)
4914 4919                  gen = 1;
4915 4920          for (i = 0; i < sizeof (zfid->zf_gen); i++)
4916 4921                  zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4917 4922  
4918 4923          if (size == LONG_FID_LEN) {
4919 4924                  uint64_t        objsetid = dmu_objset_id(zfsvfs->z_os);
4920 4925                  zfid_long_t     *zlfid;
4921 4926  
4922 4927                  zlfid = (zfid_long_t *)fidp;
4923 4928  
4924 4929                  for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4925 4930                          zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4926 4931  
4927 4932                  /* XXX - this should be the generation number for the objset */
4928 4933                  for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4929 4934                          zlfid->zf_setgen[i] = 0;
4930 4935          }
4931 4936  
4932 4937          ZFS_EXIT(zfsvfs);
4933 4938          return (0);
4934 4939  }
4935 4940  
4936 4941  static int
4937 4942  zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4938 4943      caller_context_t *ct)
4939 4944  {
4940 4945          znode_t         *zp, *xzp;
4941 4946          zfsvfs_t        *zfsvfs;
4942 4947          zfs_dirlock_t   *dl;
4943 4948          int             error;
4944 4949  
4945 4950          switch (cmd) {
4946 4951          case _PC_LINK_MAX:
4947 4952                  *valp = ULONG_MAX;
4948 4953                  return (0);
4949 4954  
4950 4955          case _PC_FILESIZEBITS:
4951 4956                  *valp = 64;
4952 4957                  return (0);
4953 4958  
4954 4959          case _PC_XATTR_EXISTS:
4955 4960                  zp = VTOZ(vp);
4956 4961                  zfsvfs = zp->z_zfsvfs;
4957 4962                  ZFS_ENTER(zfsvfs);
4958 4963                  ZFS_VERIFY_ZP(zp);
4959 4964                  *valp = 0;
4960 4965                  error = zfs_dirent_lock(&dl, zp, "", &xzp,
4961 4966                      ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4962 4967                  if (error == 0) {
4963 4968                          zfs_dirent_unlock(dl);
4964 4969                          if (!zfs_dirempty(xzp))
4965 4970                                  *valp = 1;
4966 4971                          VN_RELE(ZTOV(xzp));
4967 4972                  } else if (error == ENOENT) {
4968 4973                          /*
4969 4974                           * If there aren't extended attributes, it's the
4970 4975                           * same as having zero of them.
4971 4976                           */
4972 4977                          error = 0;
4973 4978                  }
4974 4979                  ZFS_EXIT(zfsvfs);
4975 4980                  return (error);
4976 4981  
4977 4982          case _PC_SATTR_ENABLED:
4978 4983          case _PC_SATTR_EXISTS:
4979 4984                  *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4980 4985                      (vp->v_type == VREG || vp->v_type == VDIR);
4981 4986                  return (0);
4982 4987  
4983 4988          case _PC_ACCESS_FILTERING:
4984 4989                  *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
4985 4990                      vp->v_type == VDIR;
4986 4991                  return (0);
4987 4992  
4988 4993          case _PC_ACL_ENABLED:
4989 4994                  *valp = _ACL_ACE_ENABLED;
4990 4995                  return (0);
4991 4996  
4992 4997          case _PC_MIN_HOLE_SIZE:
4993 4998                  *valp = (ulong_t)SPA_MINBLOCKSIZE;
4994 4999                  return (0);
4995 5000  
4996 5001          case _PC_TIMESTAMP_RESOLUTION:
4997 5002                  /* nanosecond timestamp resolution */
4998 5003                  *valp = 1L;
4999 5004                  return (0);
5000 5005  
5001 5006          default:
5002 5007                  return (fs_pathconf(vp, cmd, valp, cr, ct));
5003 5008          }
5004 5009  }
5005 5010  
5006 5011  /*ARGSUSED*/
5007 5012  static int
5008 5013  zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5009 5014      caller_context_t *ct)
5010 5015  {
5011 5016          znode_t *zp = VTOZ(vp);
5012 5017          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5013 5018          int error;
5014 5019          boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5015 5020  
5016 5021          ZFS_ENTER(zfsvfs);
5017 5022          ZFS_VERIFY_ZP(zp);
5018 5023          error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5019 5024          ZFS_EXIT(zfsvfs);
5020 5025  
5021 5026          return (error);
5022 5027  }
5023 5028  
5024 5029  /*ARGSUSED*/
5025 5030  static int
5026 5031  zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5027 5032      caller_context_t *ct)
5028 5033  {
5029 5034          znode_t *zp = VTOZ(vp);
5030 5035          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5031 5036          int error;
5032 5037          boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5033 5038          zilog_t *zilog = zfsvfs->z_log;
5034 5039  
5035 5040          ZFS_ENTER(zfsvfs);
5036 5041          ZFS_VERIFY_ZP(zp);
5037 5042  
5038 5043          error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5039 5044  
5040 5045          if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5041 5046                  zil_commit(zilog, 0);
5042 5047  
5043 5048          ZFS_EXIT(zfsvfs);
5044 5049          return (error);
5045 5050  }
5046 5051  
/*
 * Lower bound on the block size used when deciding whether a read is big
 * enough to be loaned an arcbuf; reads smaller than this are never loaned.
 * This must be a power of 2.
 */
int zcr_blksz_min = 1024;               /* 1K */

/*
 * Upper bound on the block size used for that same decision.  When this is
 * set below the file's block size, a read that covers only part of a block
 * may still be loaned an arcbuf.  This must be a power of 2.
 */
int zcr_blksz_max = 128 * 1024;         /* 128K */
5057 5062  
5058 5063  /*ARGSUSED*/
5059 5064  static int
5060 5065  zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5061 5066      caller_context_t *ct)
5062 5067  {
5063 5068          znode_t *zp = VTOZ(vp);
5064 5069          zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5065 5070          int max_blksz = zfsvfs->z_max_blksz;
5066 5071          uio_t *uio = &xuio->xu_uio;
5067 5072          ssize_t size = uio->uio_resid;
5068 5073          offset_t offset = uio->uio_loffset;
5069 5074          int blksz;
5070 5075          int fullblk, i;
5071 5076          arc_buf_t *abuf;
5072 5077          ssize_t maxsize;
5073 5078          int preamble, postamble;
5074 5079  
5075 5080          if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5076 5081                  return (SET_ERROR(EINVAL));
5077 5082  
5078 5083          ZFS_ENTER(zfsvfs);
5079 5084          ZFS_VERIFY_ZP(zp);
5080 5085          switch (ioflag) {
5081 5086          case UIO_WRITE:
5082 5087                  /*
5083 5088                   * Loan out an arc_buf for write if write size is bigger than
5084 5089                   * max_blksz, and the file's block size is also max_blksz.
5085 5090                   */
5086 5091                  blksz = max_blksz;
5087 5092                  if (size < blksz || zp->z_blksz != blksz) {
5088 5093                          ZFS_EXIT(zfsvfs);
5089 5094                          return (SET_ERROR(EINVAL));
5090 5095                  }
5091 5096                  /*
5092 5097                   * Caller requests buffers for write before knowing where the
5093 5098                   * write offset might be (e.g. NFS TCP write).
5094 5099                   */
5095 5100                  if (offset == -1) {
5096 5101                          preamble = 0;
5097 5102                  } else {
5098 5103                          preamble = P2PHASE(offset, blksz);
5099 5104                          if (preamble) {
5100 5105                                  preamble = blksz - preamble;
5101 5106                                  size -= preamble;
5102 5107                          }
5103 5108                  }
5104 5109  
5105 5110                  postamble = P2PHASE(size, blksz);
5106 5111                  size -= postamble;
5107 5112  
5108 5113                  fullblk = size / blksz;
5109 5114                  (void) dmu_xuio_init(xuio,
5110 5115                      (preamble != 0) + fullblk + (postamble != 0));
5111 5116                  DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5112 5117                      int, postamble, int,
5113 5118                      (preamble != 0) + fullblk + (postamble != 0));
5114 5119  
5115 5120                  /*
5116 5121                   * Have to fix iov base/len for partial buffers.  They
5117 5122                   * currently represent full arc_buf's.
5118 5123                   */
5119 5124                  if (preamble) {
5120 5125                          /* data begins in the middle of the arc_buf */
5121 5126                          abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5122 5127                              blksz);
5123 5128                          ASSERT(abuf);
5124 5129                          (void) dmu_xuio_add(xuio, abuf,
5125 5130                              blksz - preamble, preamble);
5126 5131                  }
5127 5132  
5128 5133                  for (i = 0; i < fullblk; i++) {
5129 5134                          abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5130 5135                              blksz);
5131 5136                          ASSERT(abuf);
5132 5137                          (void) dmu_xuio_add(xuio, abuf, 0, blksz);
5133 5138                  }
5134 5139  
5135 5140                  if (postamble) {
5136 5141                          /* data ends in the middle of the arc_buf */
5137 5142                          abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5138 5143                              blksz);
5139 5144                          ASSERT(abuf);
5140 5145                          (void) dmu_xuio_add(xuio, abuf, 0, postamble);
5141 5146                  }
5142 5147                  break;
5143 5148          case UIO_READ:
5144 5149                  /*
5145 5150                   * Loan out an arc_buf for read if the read size is larger than
5146 5151                   * the current file block size.  Block alignment is not
5147 5152                   * considered.  Partial arc_buf will be loaned out for read.
5148 5153                   */
5149 5154                  blksz = zp->z_blksz;
5150 5155                  if (blksz < zcr_blksz_min)
5151 5156                          blksz = zcr_blksz_min;
5152 5157                  if (blksz > zcr_blksz_max)
5153 5158                          blksz = zcr_blksz_max;
5154 5159                  /* avoid potential complexity of dealing with it */
5155 5160                  if (blksz > max_blksz) {
5156 5161                          ZFS_EXIT(zfsvfs);
5157 5162                          return (SET_ERROR(EINVAL));
5158 5163                  }
5159 5164  
5160 5165                  maxsize = zp->z_size - uio->uio_loffset;
5161 5166                  if (size > maxsize)
5162 5167                          size = maxsize;
5163 5168  
5164 5169                  if (size < blksz || vn_has_cached_data(vp)) {
5165 5170                          ZFS_EXIT(zfsvfs);
5166 5171                          return (SET_ERROR(EINVAL));
5167 5172                  }
5168 5173                  break;
5169 5174          default:
5170 5175                  ZFS_EXIT(zfsvfs);
5171 5176                  return (SET_ERROR(EINVAL));
5172 5177          }
5173 5178  
5174 5179          uio->uio_extflg = UIO_XUIO;
5175 5180          XUIO_XUZC_RW(xuio) = ioflag;
5176 5181          ZFS_EXIT(zfsvfs);
5177 5182          return (0);
5178 5183  }
5179 5184  
5180 5185  /*ARGSUSED*/
5181 5186  static int
5182 5187  zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5183 5188  {
5184 5189          int i;
5185 5190          arc_buf_t *abuf;
5186 5191          int ioflag = XUIO_XUZC_RW(xuio);
5187 5192  
5188 5193          ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5189 5194  
5190 5195          i = dmu_xuio_cnt(xuio);
5191 5196          while (i-- > 0) {
5192 5197                  abuf = dmu_xuio_arcbuf(xuio, i);
5193 5198                  /*
5194 5199                   * if abuf == NULL, it must be a write buffer
5195 5200                   * that has been returned in zfs_write().
5196 5201                   */
5197 5202                  if (abuf)
5198 5203                          dmu_return_arcbuf(abuf);
5199 5204                  ASSERT(abuf || ioflag == UIO_WRITE);
5200 5205          }
5201 5206  
5202 5207          dmu_xuio_fini(xuio);
5203 5208          return (0);
5204 5209  }
5205 5210  
/*
 * zfs_inval() and zfs_isdir() are declared up front with empty, "old style"
 * parameter lists on purpose: with no argument types for the compiler to
 * compare, they can be used in the initializations that follow without
 * producing type mismatch errors.
 */
static int zfs_inval();
static int zfs_isdir();

/*
 * Stand-in for operations that are not allowed: always fails with EINVAL.
 */
static int
zfs_inval()
{
        int error = SET_ERROR(EINVAL);

        return (error);
}

/*
 * Stand-in for file operations attempted on a directory: always fails with
 * EISDIR.
 */
static int
zfs_isdir()
{
        int error = SET_ERROR(EISDIR);

        return (error);
}
5226 5231  /*
5227 5232   * Directory vnode operations template
5228 5233   */
5229 5234  vnodeops_t *zfs_dvnodeops;
5230 5235  const fs_operation_def_t zfs_dvnodeops_template[] = {
5231 5236          VOPNAME_OPEN,           { .vop_open = zfs_open },
5232 5237          VOPNAME_CLOSE,          { .vop_close = zfs_close },
5233 5238          VOPNAME_READ,           { .error = zfs_isdir },
5234 5239          VOPNAME_WRITE,          { .error = zfs_isdir },
5235 5240          VOPNAME_IOCTL,          { .vop_ioctl = zfs_ioctl },
5236 5241          VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5237 5242          VOPNAME_SETATTR,        { .vop_setattr = zfs_setattr },
5238 5243          VOPNAME_ACCESS,         { .vop_access = zfs_access },
5239 5244          VOPNAME_LOOKUP,         { .vop_lookup = zfs_lookup },
5240 5245          VOPNAME_CREATE,         { .vop_create = zfs_create },
5241 5246          VOPNAME_REMOVE,         { .vop_remove = zfs_remove },
5242 5247          VOPNAME_LINK,           { .vop_link = zfs_link },
5243 5248          VOPNAME_RENAME,         { .vop_rename = zfs_rename },
5244 5249          VOPNAME_MKDIR,          { .vop_mkdir = zfs_mkdir },
5245 5250          VOPNAME_RMDIR,          { .vop_rmdir = zfs_rmdir },
5246 5251          VOPNAME_READDIR,        { .vop_readdir = zfs_readdir },
5247 5252          VOPNAME_SYMLINK,        { .vop_symlink = zfs_symlink },
5248 5253          VOPNAME_FSYNC,          { .vop_fsync = zfs_fsync },
5249 5254          VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5250 5255          VOPNAME_FID,            { .vop_fid = zfs_fid },
5251 5256          VOPNAME_SEEK,           { .vop_seek = zfs_seek },
5252 5257          VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5253 5258          VOPNAME_GETSECATTR,     { .vop_getsecattr = zfs_getsecattr },
5254 5259          VOPNAME_SETSECATTR,     { .vop_setsecattr = zfs_setsecattr },
5255 5260          VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5256 5261          NULL,                   NULL
5257 5262  };
5258 5263  
5259 5264  /*
5260 5265   * Regular file vnode operations template
5261 5266   */
5262 5267  vnodeops_t *zfs_fvnodeops;
5263 5268  const fs_operation_def_t zfs_fvnodeops_template[] = {
5264 5269          VOPNAME_OPEN,           { .vop_open = zfs_open },
5265 5270          VOPNAME_CLOSE,          { .vop_close = zfs_close },
5266 5271          VOPNAME_READ,           { .vop_read = zfs_read },
5267 5272          VOPNAME_WRITE,          { .vop_write = zfs_write },
5268 5273          VOPNAME_IOCTL,          { .vop_ioctl = zfs_ioctl },
5269 5274          VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5270 5275          VOPNAME_SETATTR,        { .vop_setattr = zfs_setattr },
5271 5276          VOPNAME_ACCESS,         { .vop_access = zfs_access },
5272 5277          VOPNAME_LOOKUP,         { .vop_lookup = zfs_lookup },
5273 5278          VOPNAME_RENAME,         { .vop_rename = zfs_rename },
5274 5279          VOPNAME_FSYNC,          { .vop_fsync = zfs_fsync },
5275 5280          VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5276 5281          VOPNAME_FID,            { .vop_fid = zfs_fid },
5277 5282          VOPNAME_SEEK,           { .vop_seek = zfs_seek },
5278 5283          VOPNAME_FRLOCK,         { .vop_frlock = zfs_frlock },
5279 5284          VOPNAME_SPACE,          { .vop_space = zfs_space },
5280 5285          VOPNAME_GETPAGE,        { .vop_getpage = zfs_getpage },
5281 5286          VOPNAME_PUTPAGE,        { .vop_putpage = zfs_putpage },
5282 5287          VOPNAME_MAP,            { .vop_map = zfs_map },
5283 5288          VOPNAME_ADDMAP,         { .vop_addmap = zfs_addmap },
5284 5289          VOPNAME_DELMAP,         { .vop_delmap = zfs_delmap },
5285 5290          VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5286 5291          VOPNAME_GETSECATTR,     { .vop_getsecattr = zfs_getsecattr },
5287 5292          VOPNAME_SETSECATTR,     { .vop_setsecattr = zfs_setsecattr },
5288 5293          VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5289 5294          VOPNAME_REQZCBUF,       { .vop_reqzcbuf = zfs_reqzcbuf },
5290 5295          VOPNAME_RETZCBUF,       { .vop_retzcbuf = zfs_retzcbuf },
5291 5296          NULL,                   NULL
5292 5297  };
5293 5298  
5294 5299  /*
5295 5300   * Symbolic link vnode operations template
5296 5301   */
5297 5302  vnodeops_t *zfs_symvnodeops;
5298 5303  const fs_operation_def_t zfs_symvnodeops_template[] = {
5299 5304          VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5300 5305          VOPNAME_SETATTR,        { .vop_setattr = zfs_setattr },
5301 5306          VOPNAME_ACCESS,         { .vop_access = zfs_access },
5302 5307          VOPNAME_RENAME,         { .vop_rename = zfs_rename },
5303 5308          VOPNAME_READLINK,       { .vop_readlink = zfs_readlink },
5304 5309          VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5305 5310          VOPNAME_FID,            { .vop_fid = zfs_fid },
5306 5311          VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5307 5312          VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5308 5313          NULL,                   NULL
5309 5314  };
5310 5315  
5311 5316  /*
5312 5317   * special share hidden files vnode operations template
5313 5318   */
5314 5319  vnodeops_t *zfs_sharevnodeops;
5315 5320  const fs_operation_def_t zfs_sharevnodeops_template[] = {
5316 5321          VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5317 5322          VOPNAME_ACCESS,         { .vop_access = zfs_access },
5318 5323          VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5319 5324          VOPNAME_FID,            { .vop_fid = zfs_fid },
5320 5325          VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5321 5326          VOPNAME_GETSECATTR,     { .vop_getsecattr = zfs_getsecattr },
5322 5327          VOPNAME_SETSECATTR,     { .vop_setsecattr = zfs_setsecattr },
5323 5328          VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5324 5329          NULL,                   NULL
5325 5330  };
5326 5331  
5327 5332  /*
5328 5333   * Extended attribute directory vnode operations template
5329 5334   *
5330 5335   * This template is identical to the directory vnodes
5331 5336   * operation template except for restricted operations:
5332 5337   *      VOP_MKDIR()
5333 5338   *      VOP_SYMLINK()
5334 5339   *
5335 5340   * Note that there are other restrictions embedded in:
5336 5341   *      zfs_create()    - restrict type to VREG
5337 5342   *      zfs_link()      - no links into/out of attribute space
5338 5343   *      zfs_rename()    - no moves into/out of attribute space
5339 5344   */
5340 5345  vnodeops_t *zfs_xdvnodeops;
5341 5346  const fs_operation_def_t zfs_xdvnodeops_template[] = {
5342 5347          VOPNAME_OPEN,           { .vop_open = zfs_open },
5343 5348          VOPNAME_CLOSE,          { .vop_close = zfs_close },
5344 5349          VOPNAME_IOCTL,          { .vop_ioctl = zfs_ioctl },
5345 5350          VOPNAME_GETATTR,        { .vop_getattr = zfs_getattr },
5346 5351          VOPNAME_SETATTR,        { .vop_setattr = zfs_setattr },
5347 5352          VOPNAME_ACCESS,         { .vop_access = zfs_access },
5348 5353          VOPNAME_LOOKUP,         { .vop_lookup = zfs_lookup },
5349 5354          VOPNAME_CREATE,         { .vop_create = zfs_create },
5350 5355          VOPNAME_REMOVE,         { .vop_remove = zfs_remove },
5351 5356          VOPNAME_LINK,           { .vop_link = zfs_link },
5352 5357          VOPNAME_RENAME,         { .vop_rename = zfs_rename },
5353 5358          VOPNAME_MKDIR,          { .error = zfs_inval },
5354 5359          VOPNAME_RMDIR,          { .vop_rmdir = zfs_rmdir },
5355 5360          VOPNAME_READDIR,        { .vop_readdir = zfs_readdir },
5356 5361          VOPNAME_SYMLINK,        { .error = zfs_inval },
5357 5362          VOPNAME_FSYNC,          { .vop_fsync = zfs_fsync },
5358 5363          VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5359 5364          VOPNAME_FID,            { .vop_fid = zfs_fid },
5360 5365          VOPNAME_SEEK,           { .vop_seek = zfs_seek },
5361 5366          VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5362 5367          VOPNAME_GETSECATTR,     { .vop_getsecattr = zfs_getsecattr },
5363 5368          VOPNAME_SETSECATTR,     { .vop_setsecattr = zfs_setsecattr },
5364 5369          VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
5365 5370          NULL,                   NULL
5366 5371  };
5367 5372  
5368 5373  /*
5369 5374   * Error vnode operations template
5370 5375   */
5371 5376  vnodeops_t *zfs_evnodeops;
5372 5377  const fs_operation_def_t zfs_evnodeops_template[] = {
5373 5378          VOPNAME_INACTIVE,       { .vop_inactive = zfs_inactive },
5374 5379          VOPNAME_PATHCONF,       { .vop_pathconf = zfs_pathconf },
5375 5380          NULL,                   NULL
5376 5381  };

↓ open down ↓

569 lines elided

↑ open up ↑

XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX