OS-4319 zfs mishandles partial writes
--- old/usr/src/uts/common/fs/zfs/zfs_vnops.c
+++ new/usr/src/uts/common/fs/zfs/zfs_vnops.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21
22 22 /*
23 23 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
24 24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 25 * Copyright (c) 2014 Integros [integros.com]
26 26 * Copyright 2015 Joyent, Inc.
27 27 * Copyright 2017 Nexenta Systems, Inc.
28 28 */
29 29
30 30 /* Portions Copyright 2007 Jeremy Teo */
31 31 /* Portions Copyright 2010 Robert Milkowski */
32 32
33 33 #include <sys/types.h>
34 34 #include <sys/param.h>
35 35 #include <sys/time.h>
36 36 #include <sys/systm.h>
37 37 #include <sys/sysmacros.h>
38 38 #include <sys/resource.h>
39 39 #include <sys/vfs.h>
40 40 #include <sys/vfs_opreg.h>
41 41 #include <sys/vnode.h>
42 42 #include <sys/file.h>
43 43 #include <sys/stat.h>
44 44 #include <sys/kmem.h>
45 45 #include <sys/taskq.h>
46 46 #include <sys/uio.h>
47 47 #include <sys/vmsystm.h>
48 48 #include <sys/atomic.h>
49 49 #include <sys/vm.h>
50 50 #include <vm/seg_vn.h>
51 51 #include <vm/pvn.h>
52 52 #include <vm/as.h>
53 53 #include <vm/kpm.h>
54 54 #include <vm/seg_kpm.h>
55 55 #include <sys/mman.h>
56 56 #include <sys/pathname.h>
57 57 #include <sys/cmn_err.h>
58 58 #include <sys/errno.h>
59 59 #include <sys/unistd.h>
60 60 #include <sys/zfs_dir.h>
61 61 #include <sys/zfs_acl.h>
62 62 #include <sys/zfs_ioctl.h>
63 63 #include <sys/fs/zfs.h>
64 64 #include <sys/dmu.h>
65 65 #include <sys/dmu_objset.h>
66 66 #include <sys/spa.h>
67 67 #include <sys/txg.h>
68 68 #include <sys/dbuf.h>
69 69 #include <sys/zap.h>
70 70 #include <sys/sa.h>
71 71 #include <sys/dirent.h>
72 72 #include <sys/policy.h>
73 73 #include <sys/sunddi.h>
74 74 #include <sys/filio.h>
75 75 #include <sys/sid.h>
76 76 #include "fs/fs_subr.h"
77 77 #include <sys/zfs_ctldir.h>
78 78 #include <sys/zfs_fuid.h>
79 79 #include <sys/zfs_sa.h>
80 80 #include <sys/dnlc.h>
81 81 #include <sys/zfs_rlock.h>
82 82 #include <sys/extdirent.h>
83 83 #include <sys/kidmap.h>
84 84 #include <sys/cred.h>
85 85 #include <sys/attr.h>
86 86 #include <sys/zil.h>
87 87
88 88 /*
89 89 * Programming rules.
90 90 *
91 91 * Each vnode op performs some logical unit of work. To do this, the ZPL must
92 92 * properly lock its in-core state, create a DMU transaction, do the work,
93 93 * record this work in the intent log (ZIL), commit the DMU transaction,
94 94 * and wait for the intent log to commit if it is a synchronous operation.
95 95 * Moreover, the vnode ops must work in both normal and log replay context.
96 96 * The ordering of events is important to avoid deadlocks and references
97 97 * to freed memory. The example below illustrates the following Big Rules:
98 98 *
99 99 * (1) A check must be made in each zfs thread for a mounted file system.
100 100  *      This is done while avoiding races, using ZFS_ENTER(zfsvfs).
101 101 * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes
102 102 * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros
103 103 * can return EIO from the calling function.
104 104 *
105 105 * (2) VN_RELE() should always be the last thing except for zil_commit()
106 106 * (if necessary) and ZFS_EXIT(). This is for 3 reasons:
107 107 * First, if it's the last reference, the vnode/znode
108 108 * can be freed, so the zp may point to freed memory. Second, the last
109 109 * reference will call zfs_zinactive(), which may induce a lot of work --
110 110 * pushing cached pages (which acquires range locks) and syncing out
111 111 * cached atime changes. Third, zfs_zinactive() may require a new tx,
112 112 * which could deadlock the system if you were already holding one.
113 113 * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC().
114 114 *
115 115 * (3) All range locks must be grabbed before calling dmu_tx_assign(),
116 116 * as they can span dmu_tx_assign() calls.
117 117 *
118 118 * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to
119 119 * dmu_tx_assign(). This is critical because we don't want to block
120 120 * while holding locks.
121 121 *
122 122 * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This
123 123 * reduces lock contention and CPU usage when we must wait (note that if
124 124 * throughput is constrained by the storage, nearly every transaction
125 125 * must wait).
126 126 *
127 127 * Note, in particular, that if a lock is sometimes acquired before
128 128 * the tx assigns, and sometimes after (e.g. z_lock), then failing
129 129 * to use a non-blocking assign can deadlock the system. The scenario:
130 130 *
131 131 * Thread A has grabbed a lock before calling dmu_tx_assign().
132 132 * Thread B is in an already-assigned tx, and blocks for this lock.
133 133 * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
134 134 * forever, because the previous txg can't quiesce until B's tx commits.
135 135 *
136 136 * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
137 137 * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
138 138 * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
139 139 * to indicate that this operation has already called dmu_tx_wait().
140 140 * This will ensure that we don't retry forever, waiting a short bit
141 141 * each time.
142 142 *
143 143 * (5) If the operation succeeded, generate the intent log entry for it
144 144 * before dropping locks. This ensures that the ordering of events
145 145 * in the intent log matches the order in which they actually occurred.
146 146 * During ZIL replay the zfs_log_* functions will update the sequence
147 147 * number to indicate the zil transaction has replayed.
148 148 *
149 149 * (6) At the end of each vnode op, the DMU tx must always commit,
150 150 * regardless of whether there were any errors.
151 151 *
152 152 * (7) After dropping all locks, invoke zil_commit(zilog, foid)
153 153 * to ensure that synchronous semantics are provided when necessary.
154 154 *
155 155 * In general, this is how things should be ordered in each vnode op:
156 156 *
157 157 * ZFS_ENTER(zfsvfs); // exit if unmounted
158 158 * top:
159 159 * zfs_dirent_lock(&dl, ...) // lock directory entry (may VN_HOLD())
160 160 * rw_enter(...); // grab any other locks you need
161 161 * tx = dmu_tx_create(...); // get DMU tx
162 162 * dmu_tx_hold_*(); // hold each object you might modify
163 163 * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
164 164 * if (error) {
165 165 * rw_exit(...); // drop locks
166 166 * zfs_dirent_unlock(dl); // unlock directory entry
167 167 * VN_RELE(...); // release held vnodes
168 168 * if (error == ERESTART) {
169 169 * waited = B_TRUE;
170 170 * dmu_tx_wait(tx);
171 171 * dmu_tx_abort(tx);
172 172 * goto top;
173 173 * }
174 174 * dmu_tx_abort(tx); // abort DMU tx
175 175 * ZFS_EXIT(zfsvfs); // finished in zfs
176 176 * return (error); // really out of space
177 177 * }
178 178 * error = do_real_work(); // do whatever this VOP does
179 179 * if (error == 0)
180 180 * zfs_log_*(...); // on success, make ZIL entry
181 181 * dmu_tx_commit(tx); // commit DMU tx -- error or not
182 182 * rw_exit(...); // drop locks
183 183 * zfs_dirent_unlock(dl); // unlock directory entry
184 184 * VN_RELE(...); // release held vnodes
185 185 * zil_commit(zilog, foid); // synchronous when necessary
186 186 * ZFS_EXIT(zfsvfs); // finished in zfs
187 187 * return (error); // done, report error
188 188 */
189 189
190 190 /* ARGSUSED */
191 191 static int
192 192 zfs_open(vnode_t **vpp, int flag, cred_t *cr, caller_context_t *ct)
193 193 {
194 194 znode_t *zp = VTOZ(*vpp);
195 195 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
196 196
197 197 ZFS_ENTER(zfsvfs);
198 198 ZFS_VERIFY_ZP(zp);
199 199
200 200 if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) &&
201 201 ((flag & FAPPEND) == 0)) {
202 202 ZFS_EXIT(zfsvfs);
203 203 return (SET_ERROR(EPERM));
204 204 }
205 205
206 206 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
207 207 ZTOV(zp)->v_type == VREG &&
208 208 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) {
209 209 if (fs_vscan(*vpp, cr, 0) != 0) {
210 210 ZFS_EXIT(zfsvfs);
211 211 return (SET_ERROR(EACCES));
212 212 }
213 213 }
214 214
215 215 /* Keep a count of the synchronous opens in the znode */
216 216 if (flag & (FSYNC | FDSYNC))
217 217 atomic_inc_32(&zp->z_sync_cnt);
218 218
219 219 ZFS_EXIT(zfsvfs);
220 220 return (0);
221 221 }
222 222
223 223 /* ARGSUSED */
224 224 static int
225 225 zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr,
226 226 caller_context_t *ct)
227 227 {
228 228 znode_t *zp = VTOZ(vp);
229 229 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
230 230
231 231 /*
232 232 * Clean up any locks held by this process on the vp.
233 233 */
234 234 cleanlocks(vp, ddi_get_pid(), 0);
235 235 cleanshares(vp, ddi_get_pid());
236 236
237 237 ZFS_ENTER(zfsvfs);
238 238 ZFS_VERIFY_ZP(zp);
239 239
240 240 /* Decrement the synchronous opens in the znode */
241 241 if ((flag & (FSYNC | FDSYNC)) && (count == 1))
242 242 atomic_dec_32(&zp->z_sync_cnt);
243 243
244 244 if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
245 245 ZTOV(zp)->v_type == VREG &&
246 246 !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
247 247 VERIFY(fs_vscan(vp, cr, 1) == 0);
248 248
249 249 ZFS_EXIT(zfsvfs);
250 250 return (0);
251 251 }
252 252
253 253 /*
254 254 * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
255 255 * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
256 256 */
257 257 static int
258 258 zfs_holey(vnode_t *vp, int cmd, offset_t *off)
259 259 {
260 260 znode_t *zp = VTOZ(vp);
261 261 uint64_t noff = (uint64_t)*off; /* new offset */
262 262 uint64_t file_sz;
263 263 int error;
264 264 boolean_t hole;
265 265
266 266 file_sz = zp->z_size;
267 267 if (noff >= file_sz) {
268 268 return (SET_ERROR(ENXIO));
269 269 }
270 270
271 271 if (cmd == _FIO_SEEK_HOLE)
272 272 hole = B_TRUE;
273 273 else
274 274 hole = B_FALSE;
275 275
276 276 error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
277 277
278 278 if (error == ESRCH)
279 279 return (SET_ERROR(ENXIO));
280 280
281 281 /*
282 282 * We could find a hole that begins after the logical end-of-file,
283 283 * because dmu_offset_next() only works on whole blocks. If the
284 284 * EOF falls mid-block, then indicate that the "virtual hole"
285 285 * at the end of the file begins at the logical EOF, rather than
286 286 * at the end of the last block.
287 287 */
288 288 if (noff > file_sz) {
289 289 ASSERT(hole);
290 290 noff = file_sz;
291 291 }
292 292
293 293 if (noff < *off)
294 294 return (error);
295 295 *off = noff;
296 296 return (error);
297 297 }
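
zfs_holey() is the kernel half of hole/data seeking: "off" is in/out, ENXIO means there is no hole or data at or beyond the offset, and a "virtual hole" is reported at the logical EOF even when the last block is only partly filled. From userland the same machinery is reached through lseek(2) with SEEK_DATA/SEEK_HOLE. A minimal sketch, illustrative only (assumes a sparse file named "f"; error checking omitted):

    #include <sys/types.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fd = open("f", O_RDONLY);
            /* First data byte at or after offset 0. */
            off_t data = lseek(fd, 0, SEEK_DATA);
            /* First hole at or after the data; EOF counts as a hole. */
            off_t hole = lseek(fd, data, SEEK_HOLE);
            (void) printf("data at %lld, hole at %lld\n",
                (long long)data, (long long)hole);
            return (0);
    }

Both calls return -1 with errno set to ENXIO when there is nothing further to find, matching the SET_ERROR(ENXIO) paths above.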
298 298
299 299 /* ARGSUSED */
300 300 static int
301 301 zfs_ioctl(vnode_t *vp, int com, intptr_t data, int flag, cred_t *cred,
302 302 int *rvalp, caller_context_t *ct)
303 303 {
304 304 offset_t off;
305 305 offset_t ndata;
306 306 dmu_object_info_t doi;
307 307 int error;
308 308 zfsvfs_t *zfsvfs;
309 309 znode_t *zp;
310 310
311 311 switch (com) {
312 312 case _FIOFFS:
313 313 {
314 314 return (zfs_sync(vp->v_vfsp, 0, cred));
315 315
316 316 /*
317 317 * The following two ioctls are used by bfu. Faking out,
318 318 * necessary to avoid bfu errors.
319 319 */
320 320 }
321 321 case _FIOGDIO:
322 322 case _FIOSDIO:
323 323 {
324 324 return (0);
325 325 }
326 326
327 327 case _FIO_SEEK_DATA:
328 328 case _FIO_SEEK_HOLE:
329 329 {
330 330 if (ddi_copyin((void *)data, &off, sizeof (off), flag))
331 331 return (SET_ERROR(EFAULT));
332 332
333 333 zp = VTOZ(vp);
334 334 zfsvfs = zp->z_zfsvfs;
335 335 ZFS_ENTER(zfsvfs);
336 336 ZFS_VERIFY_ZP(zp);
337 337
338 338 /* offset parameter is in/out */
339 339 error = zfs_holey(vp, com, &off);
340 340 ZFS_EXIT(zfsvfs);
341 341 if (error)
342 342 return (error);
343 343 if (ddi_copyout(&off, (void *)data, sizeof (off), flag))
344 344 return (SET_ERROR(EFAULT));
345 345 return (0);
346 346 }
347 347 case _FIO_COUNT_FILLED:
348 348 {
349 349 /*
350 350 * _FIO_COUNT_FILLED adds a new ioctl command which
351 351 * exposes the number of filled blocks in a
352 352 * ZFS object.
353 353 */
354 354 zp = VTOZ(vp);
355 355 zfsvfs = zp->z_zfsvfs;
356 356 ZFS_ENTER(zfsvfs);
357 357 ZFS_VERIFY_ZP(zp);
358 358
359 359 /*
360 360 * Wait for all dirty blocks for this object
361 361 * to get synced out to disk, and the DMU info
362 362 * updated.
363 363 */
364 364 error = dmu_object_wait_synced(zfsvfs->z_os, zp->z_id);
365 365 if (error) {
366 366 ZFS_EXIT(zfsvfs);
367 367 return (error);
368 368 }
369 369
370 370 /*
371 371 * Retrieve fill count from DMU object.
372 372 */
373 373 error = dmu_object_info(zfsvfs->z_os, zp->z_id, &doi);
374 374 if (error) {
375 375 ZFS_EXIT(zfsvfs);
376 376 return (error);
377 377 }
378 378
379 379 ndata = doi.doi_fill_count;
380 380
381 381 ZFS_EXIT(zfsvfs);
382 382 if (ddi_copyout(&ndata, (void *)data, sizeof (ndata), flag))
383 383 return (SET_ERROR(EFAULT));
384 384 return (0);
385 385 }
386 386 }
387 387 return (SET_ERROR(ENOTTY));
388 388 }
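
The _FIO_COUNT_FILLED path first forces dirty blocks out with dmu_object_wait_synced() so that doi_fill_count is current, then copies the count back out as an offset_t. A hedged userland sketch (the ioctl and <sys/filio.h> are real; the file name, headers beyond that, and the lack of error detail are illustrative):

    #include <sys/types.h>
    #include <sys/filio.h>
    #include <fcntl.h>
    #include <stropts.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fd = open("f", O_RDONLY);
            offset_t filled;

            /* Number of filled (non-hole) blocks in the ZFS object. */
            if (ioctl(fd, _FIO_COUNT_FILLED, &filled) != 0)
                    return (1);
            return (0);
    }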
389 389
390 390 /*
391 391 * Utility functions to map and unmap a single physical page. These
392 392 * are used to manage the mappable copies of ZFS file data, and therefore
393 393 * do not update ref/mod bits.
394 394 */
395 395 caddr_t
396 396 zfs_map_page(page_t *pp, enum seg_rw rw)
397 397 {
398 398 if (kpm_enable)
399 399 return (hat_kpm_mapin(pp, 0));
400 400 ASSERT(rw == S_READ || rw == S_WRITE);
401 401 return (ppmapin(pp, PROT_READ | ((rw == S_WRITE) ? PROT_WRITE : 0),
402 402 (caddr_t)-1));
403 403 }
404 404
405 405 void
406 406 zfs_unmap_page(page_t *pp, caddr_t addr)
407 407 {
408 408 if (kpm_enable) {
409 409 hat_kpm_mapout(pp, 0, addr);
410 410 } else {
411 411 ppmapout(addr);
412 412 }
413 413 }
414 414
415 415 /*
416 416 * When a file is memory mapped, we must keep the IO data synchronized
417 417 * between the DMU cache and the memory mapped pages. What this means:
418 418 *
419 419 * On Write: If we find a memory mapped page, we write to *both*
420 420 * the page and the dmu buffer.
421 421 */
422 422 static void
423 423 update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid)
424 424 {
425 425 int64_t off;
426 426
427 427 off = start & PAGEOFFSET;
428 428 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
429 429 page_t *pp;
430 430 uint64_t nbytes = MIN(PAGESIZE - off, len);
431 431
432 432 if (pp = page_lookup(vp, start, SE_SHARED)) {
433 433 caddr_t va;
434 434
435 435 va = zfs_map_page(pp, S_WRITE);
436 436 (void) dmu_read(os, oid, start+off, nbytes, va+off,
437 437 DMU_READ_PREFETCH);
438 438 zfs_unmap_page(pp, va);
439 439 page_unlock(pp);
440 440 }
441 441 len -= nbytes;
442 442 off = 0;
443 443 }
444 444 }
445 445
446 446 /*
447 447 * When a file is memory mapped, we must keep the IO data synchronized
448 448 * between the DMU cache and the memory mapped pages. What this means:
449 449 *
450 450 * On Read: We "read" preferentially from memory mapped pages,
451 451 * else we default from the dmu buffer.
452 452 *
453 453 * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when
454 454 * the file is memory mapped.
455 455 */
456 456 static int
457 457 mappedread(vnode_t *vp, int nbytes, uio_t *uio)
458 458 {
459 459 znode_t *zp = VTOZ(vp);
460 460 int64_t start, off;
461 461 int len = nbytes;
462 462 int error = 0;
463 463
464 464 start = uio->uio_loffset;
465 465 off = start & PAGEOFFSET;
466 466 for (start &= PAGEMASK; len > 0; start += PAGESIZE) {
467 467 page_t *pp;
468 468 uint64_t bytes = MIN(PAGESIZE - off, len);
469 469
470 470 if (pp = page_lookup(vp, start, SE_SHARED)) {
471 471 caddr_t va;
472 472
473 473 va = zfs_map_page(pp, S_READ);
474 474 error = uiomove(va + off, bytes, UIO_READ, uio);
475 475 zfs_unmap_page(pp, va);
476 476 page_unlock(pp);
477 477 } else {
478 478 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
479 479 uio, bytes);
480 480 }
481 481 len -= bytes;
482 482 off = 0;
483 483 if (error)
484 484 break;
485 485 }
486 486 return (error);
487 487 }
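
update_pages() and mappedread() together keep the two copies of file data coherent: writes land in both the DMU buffer and any cached page, and reads prefer a cached page when one exists. A small illustrative sketch of the guarantee this buys userland (assumes "f" exists and is at least a page long; names and sizes are arbitrary):

    #include <sys/mman.h>
    #include <assert.h>
    #include <fcntl.h>
    #include <string.h>
    #include <unistd.h>

    int
    main(void)
    {
            int fd = open("f", O_RDWR);
            char *p = mmap(NULL, 4096, PROT_READ, MAP_SHARED, fd, 0);

            /*
             * write(2) goes through zfs_write(), which calls
             * update_pages() because the file has cached pages...
             */
            (void) pwrite(fd, "new", 3, 0);
            /* ...so the existing mapping observes the new bytes. */
            assert(memcmp(p, "new", 3) == 0);
            return (0);
    }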
488 488
489 489 offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */
490 490
491 491 /*
492 492 * Read bytes from specified file into supplied buffer.
493 493 *
494 494 * IN: vp - vnode of file to be read from.
495 495 * uio - structure supplying read location, range info,
496 496 * and return buffer.
497 497 * ioflag - SYNC flags; used to provide FRSYNC semantics.
498 498 * cr - credentials of caller.
499 499 * ct - caller context
500 500 *
501 501 * OUT: uio - updated offset and range, buffer filled.
502 502 *
503 503 * RETURN: 0 on success, error code on failure.
504 504 *
505 505 * Side Effects:
506 506 * vp - atime updated if byte count > 0
507 507 */
508 508 /* ARGSUSED */
509 509 static int
510 510 zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
511 511 {
512 512 znode_t *zp = VTOZ(vp);
513 513 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
514 514 ssize_t n, nbytes;
515 515 int error = 0;
516 516 xuio_t *xuio = NULL;
517 517
518 518 ZFS_ENTER(zfsvfs);
519 519 ZFS_VERIFY_ZP(zp);
520 520
521 521 if (zp->z_pflags & ZFS_AV_QUARANTINED) {
522 522 ZFS_EXIT(zfsvfs);
523 523 return (SET_ERROR(EACCES));
524 524 }
525 525
526 526 /*
527 527 * Validate file offset
528 528 */
529 529 if (uio->uio_loffset < (offset_t)0) {
530 530 ZFS_EXIT(zfsvfs);
531 531 return (SET_ERROR(EINVAL));
532 532 }
533 533
534 534 /*
535 535 * Fasttrack empty reads
536 536 */
537 537 if (uio->uio_resid == 0) {
538 538 ZFS_EXIT(zfsvfs);
539 539 return (0);
540 540 }
541 541
542 542 /*
543 543 * Check for mandatory locks
544 544 */
545 545 if (MANDMODE(zp->z_mode)) {
546 546 if (error = chklock(vp, FREAD,
547 547 uio->uio_loffset, uio->uio_resid, uio->uio_fmode, ct)) {
548 548 ZFS_EXIT(zfsvfs);
549 549 return (error);
550 550 }
551 551 }
552 552
553 553 /*
554 554 * If we're in FRSYNC mode, sync out this znode before reading it.
555 555 */
556 556 if (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
557 557 zil_commit(zfsvfs->z_log, zp->z_id);
558 558
559 559 /*
560 560 * Lock the range against changes.
561 561 */
562 562 locked_range_t *lr = rangelock_enter(&zp->z_rangelock,
563 563 uio->uio_loffset, uio->uio_resid, RL_READER);
564 564
565 565 /*
566 566 * If we are reading past end-of-file we can skip
567 567 * to the end; but we might still need to set atime.
568 568 */
569 569 if (uio->uio_loffset >= zp->z_size) {
570 570 error = 0;
571 571 goto out;
572 572 }
573 573
574 574 ASSERT(uio->uio_loffset < zp->z_size);
575 575 n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset);
576 576
577 577 if ((uio->uio_extflg == UIO_XUIO) &&
578 578 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) {
579 579 int nblk;
580 580 int blksz = zp->z_blksz;
581 581 uint64_t offset = uio->uio_loffset;
582 582
583 583 xuio = (xuio_t *)uio;
584 584 if ((ISP2(blksz))) {
585 585 nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset,
586 586 blksz)) / blksz;
587 587 } else {
588 588 ASSERT(offset + n <= blksz);
589 589 nblk = 1;
590 590 }
591 591 (void) dmu_xuio_init(xuio, nblk);
592 592
593 593 if (vn_has_cached_data(vp)) {
594 594 /*
595 595 * For simplicity, we always allocate a full buffer
596 596 * even if we only expect to read a portion of a block.
597 597 */
598 598 while (--nblk >= 0) {
599 599 (void) dmu_xuio_add(xuio,
600 600 dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
601 601 blksz), 0, blksz);
602 602 }
603 603 }
604 604 }
605 605
606 606 while (n > 0) {
607 607 nbytes = MIN(n, zfs_read_chunk_size -
608 608 P2PHASE(uio->uio_loffset, zfs_read_chunk_size));
609 609
610 610 if (vn_has_cached_data(vp)) {
611 611 error = mappedread(vp, nbytes, uio);
612 612 } else {
613 613 error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl),
614 614 uio, nbytes);
615 615 }
616 616 if (error) {
617 617 /* convert checksum errors into IO errors */
618 618 if (error == ECKSUM)
619 619 error = SET_ERROR(EIO);
620 620 break;
621 621 }
622 622
623 623 n -= nbytes;
624 624 }
625 625 out:
626 626 rangelock_exit(lr);
627 627
628 628 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
629 629 ZFS_EXIT(zfsvfs);
630 630 return (error);
631 631 }
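
The read loop never issues more than zfs_read_chunk_size (the tunable above) per pass, and trims the first pass so later passes start chunk-aligned: P2PHASE(x, align) is x & (align - 1) for a power-of-two align. A worked sketch of the arithmetic, using the same macros (hypothetical helper, not part of the source):

    #include <sys/types.h>
    #include <sys/sysmacros.h>

    /*
     * With a 1 MB chunk, a 3 MB read starting at offset 0x180000
     * (1.5 MB) is issued as 512 KB, 1 MB, 1 MB, 512 KB -- no single
     * pass crosses a chunk boundary.
     */
    static ssize_t
    read_pass_size(ssize_t n, offset_t off, offset_t chunk)
    {
            return (MIN(n, chunk - P2PHASE(off, chunk)));
    }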
632 632
633 633 /*
634 634 * Write the bytes to a file.
635 635 *
636 636 * IN: vp - vnode of file to be written to.
637 637 * uio - structure supplying write location, range info,
638 638 * and data buffer.
639 639 * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is
640 640 * set if in append mode.
641 641 * cr - credentials of caller.
642 642 * ct - caller context (NFS/CIFS fem monitor only)
643 643 *
644 644 * OUT: uio - updated offset and range.
645 645 *
646 646 * RETURN: 0 on success, error code on failure.
647 647 *
648 648 * Timestamps:
649 649 * vp - ctime|mtime updated if byte count > 0
650 650 */
651 651
652 652 /* ARGSUSED */
653 653 static int
654 654 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
655 655 {
656 656 znode_t *zp = VTOZ(vp);
657 657 rlim64_t limit = uio->uio_llimit;
658 658 ssize_t start_resid = uio->uio_resid;
659 659 ssize_t tx_bytes;
660 660 uint64_t end_size;
661 661 dmu_tx_t *tx;
662 662 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
663 663 zilog_t *zilog;
664 664 offset_t woff;
665 665 ssize_t n, nbytes;
666 666 int max_blksz = zfsvfs->z_max_blksz;
667 667 int error = 0;
668 + int prev_error;
668 669 arc_buf_t *abuf;
669 670 iovec_t *aiov = NULL;
670 671 xuio_t *xuio = NULL;
671 672 int i_iov = 0;
672 673 int iovcnt = uio->uio_iovcnt;
673 674 iovec_t *iovp = uio->uio_iov;
674 675 int write_eof;
675 676 int count = 0;
676 677 sa_bulk_attr_t bulk[4];
677 678 uint64_t mtime[2], ctime[2];
678 679
679 680 /*
680 681 * Fasttrack empty write
681 682 */
682 683 n = start_resid;
683 684 if (n == 0)
684 685 return (0);
685 686
686 687 if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
687 688 limit = MAXOFFSET_T;
688 689
689 690 ZFS_ENTER(zfsvfs);
690 691 ZFS_VERIFY_ZP(zp);
691 692
692 693 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
693 694 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
694 695 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
695 696 &zp->z_size, 8);
696 697 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
697 698 &zp->z_pflags, 8);
698 699
699 700 /*
700 701          * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots), our
701 702 * callers might not be able to detect properly that we are read-only,
702 703 * so check it explicitly here.
703 704 */
704 705 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
705 706 ZFS_EXIT(zfsvfs);
706 707 return (SET_ERROR(EROFS));
707 708 }
708 709
709 710 /*
710 711 * If immutable or not appending then return EPERM.
711 712 * Intentionally allow ZFS_READONLY through here.
712 713 * See zfs_zaccess_common()
713 714 */
714 715 if ((zp->z_pflags & ZFS_IMMUTABLE) ||
715 716 ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) &&
716 717 (uio->uio_loffset < zp->z_size))) {
717 718 ZFS_EXIT(zfsvfs);
718 719 return (SET_ERROR(EPERM));
719 720 }
720 721
721 722 zilog = zfsvfs->z_log;
722 723
723 724 /*
724 725 * Validate file offset
725 726 */
726 727 woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset;
727 728 if (woff < 0) {
728 729 ZFS_EXIT(zfsvfs);
729 730 return (SET_ERROR(EINVAL));
730 731 }
731 732
732 733 /*
733 734 * Check for mandatory locks before calling rangelock_enter()
734 735 * in order to prevent a deadlock with locks set via fcntl().
735 736 */
736 737 if (MANDMODE((mode_t)zp->z_mode) &&
737 738 (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) {
738 739 ZFS_EXIT(zfsvfs);
739 740 return (error);
740 741 }
741 742
742 743 /*
743 744          * Pre-fault the pages to ensure slow (e.g. NFS) pages
744 745          * don't hold up the txg.
745 746 * Skip this if uio contains loaned arc_buf.
746 747 */
747 748 if ((uio->uio_extflg == UIO_XUIO) &&
748 749 (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY))
749 750 xuio = (xuio_t *)uio;
750 751 else
751 752 uio_prefaultpages(MIN(n, max_blksz), uio);
752 753
753 754 /*
754 755 * If in append mode, set the io offset pointer to eof.
755 756 */
756 757 locked_range_t *lr;
757 758 if (ioflag & FAPPEND) {
758 759 /*
759 760 * Obtain an appending range lock to guarantee file append
760 761 * semantics. We reset the write offset once we have the lock.
761 762 */
762 763 lr = rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND);
763 764 woff = lr->lr_offset;
764 765 if (lr->lr_length == UINT64_MAX) {
765 766 /*
766 767 * We overlocked the file because this write will cause
767 768 * the file block size to increase.
768 769 * Note that zp_size cannot change with this lock held.
769 770 */
770 771 woff = zp->z_size;
771 772 }
772 773 uio->uio_loffset = woff;
773 774 } else {
774 775 /*
775 776 * Note that if the file block size will change as a result of
776 777 * this write, then this range lock will lock the entire file
777 778 * so that we can re-write the block safely.
778 779 */
779 780 lr = rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER);
780 781 }
781 782
782 783 if (woff >= limit) {
783 784 rangelock_exit(lr);
784 785 ZFS_EXIT(zfsvfs);
785 786 return (SET_ERROR(EFBIG));
786 787 }
787 788
788 789 if ((woff + n) > limit || woff > (limit - n))
789 790 n = limit - woff;
790 791
791 792 /* Will this write extend the file length? */
792 793 write_eof = (woff + n > zp->z_size);
793 794
794 795 end_size = MAX(zp->z_size, woff + n);
795 796
796 797 /*
797 798 * Write the file in reasonable size chunks. Each chunk is written
798 799 * in a separate transaction; this keeps the intent log records small
799 800 * and allows us to do more fine-grained space accounting.
800 801 */
801 802 while (n > 0) {
802 803 abuf = NULL;
803 804 woff = uio->uio_loffset;
804 805 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
805 806 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
806 807 if (abuf != NULL)
807 808 dmu_return_arcbuf(abuf);
808 809 error = SET_ERROR(EDQUOT);
809 810 break;
810 811 }
811 812
812 813 if (xuio && abuf == NULL) {
813 814 ASSERT(i_iov < iovcnt);
814 815 aiov = &iovp[i_iov];
815 816 abuf = dmu_xuio_arcbuf(xuio, i_iov);
816 817 dmu_xuio_clear(xuio, i_iov);
817 818 DTRACE_PROBE3(zfs_cp_write, int, i_iov,
818 819 iovec_t *, aiov, arc_buf_t *, abuf);
819 820 ASSERT((aiov->iov_base == abuf->b_data) ||
820 821 ((char *)aiov->iov_base - (char *)abuf->b_data +
821 822 aiov->iov_len == arc_buf_size(abuf)));
822 823 i_iov++;
823 824 } else if (abuf == NULL && n >= max_blksz &&
824 825 woff >= zp->z_size &&
825 826 P2PHASE(woff, max_blksz) == 0 &&
826 827 zp->z_blksz == max_blksz) {
827 828 /*
828 829 * This write covers a full block. "Borrow" a buffer
829 830 * from the dmu so that we can fill it before we enter
830 831 * a transaction. This avoids the possibility of
831 832 * holding up the transaction if the data copy hangs
832 833 * up on a pagefault (e.g., from an NFS server mapping).
833 834 */
834 835 size_t cbytes;
835 836
836 837 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
837 838 max_blksz);
838 839 ASSERT(abuf != NULL);
839 840 ASSERT(arc_buf_size(abuf) == max_blksz);
840 841 if (error = uiocopy(abuf->b_data, max_blksz,
841 842 UIO_WRITE, uio, &cbytes)) {
842 843 dmu_return_arcbuf(abuf);
843 844 break;
844 845 }
845 846 ASSERT(cbytes == max_blksz);
846 847 }
847 848
848 849 /*
849 850 * Start a transaction.
850 851 */
851 852 tx = dmu_tx_create(zfsvfs->z_os);
852 853 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
853 854 dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz));
854 855 zfs_sa_upgrade_txholds(tx, zp);
855 856 error = dmu_tx_assign(tx, TXG_WAIT);
856 857 if (error) {
857 858 dmu_tx_abort(tx);
858 859 if (abuf != NULL)
859 860 dmu_return_arcbuf(abuf);
860 861 break;
861 862 }
862 863
863 864 /*
864 865 * If rangelock_enter() over-locked we grow the blocksize
865 866 * and then reduce the lock range. This will only happen
866 867 * on the first iteration since rangelock_reduce() will
867 868 * shrink down lr_length to the appropriate size.
868 869 */
869 870 if (lr->lr_length == UINT64_MAX) {
870 871 uint64_t new_blksz;
871 872
872 873 if (zp->z_blksz > max_blksz) {
873 874 /*
874 875 * File's blocksize is already larger than the
875 876 * "recordsize" property. Only let it grow to
876 877 * the next power of 2.
877 878 */
878 879 ASSERT(!ISP2(zp->z_blksz));
879 880 new_blksz = MIN(end_size,
880 881 1 << highbit64(zp->z_blksz));
881 882 } else {
882 883 new_blksz = MIN(end_size, max_blksz);
883 884 }
884 885 zfs_grow_blocksize(zp, new_blksz, tx);
885 886 rangelock_reduce(lr, woff, n);
886 887 }
887 888
888 889 /*
889 890 * XXX - should we really limit each write to z_max_blksz?
890 891 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
891 892 */
892 893 nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
893 894
894 895 if (abuf == NULL) {
895 896 tx_bytes = uio->uio_resid;
896 897 error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
897 898 uio, nbytes, tx);
898 899 tx_bytes -= uio->uio_resid;
899 900 } else {
900 901 tx_bytes = nbytes;
901 902 ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
902 903 /*
903 904 * If this is not a full block write, but we are
904 905 * extending the file past EOF and this data starts
905 906 * block-aligned, use assign_arcbuf(). Otherwise,
906 907 * write via dmu_write().
907 908 */
908 909 if (tx_bytes < max_blksz && (!write_eof ||
909 910 aiov->iov_base != abuf->b_data)) {
910 911 ASSERT(xuio);
911 912 dmu_write(zfsvfs->z_os, zp->z_id, woff,
912 913 aiov->iov_len, aiov->iov_base, tx);
913 914 dmu_return_arcbuf(abuf);
914 915 xuio_stat_wbuf_copied();
915 916 } else {
916 917 ASSERT(xuio || tx_bytes == max_blksz);
917 918 dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl),
918 919 woff, abuf, tx);
919 920 }
920 921 ASSERT(tx_bytes <= uio->uio_resid);
921 922 uioskip(uio, tx_bytes);
922 923 }
923 924 if (tx_bytes && vn_has_cached_data(vp)) {
924 925 update_pages(vp, woff,
925 926 tx_bytes, zfsvfs->z_os, zp->z_id);
926 927 }
927 928
928 929 /*
929 930 * If we made no progress, we're done. If we made even
930 931 * partial progress, update the znode and ZIL accordingly.
931 932 */
932 933 if (tx_bytes == 0) {
933 934 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
934 935 (void *)&zp->z_size, sizeof (uint64_t), tx);
935 936 dmu_tx_commit(tx);
936 937 ASSERT(error != 0);
937 938 break;
938 939 }
939 940
940 941 /*
941 942 * Clear Set-UID/Set-GID bits on successful write if not
942 943                  * privileged and at least one of the execute bits is set.
943 944 *
944 945                  * It would be nice to do this after all writes have
945 946 * been done, but that would still expose the ISUID/ISGID
946 947 * to another app after the partial write is committed.
947 948 *
948 949 * Note: we don't call zfs_fuid_map_id() here because
949 950 * user 0 is not an ephemeral uid.
950 951 */
951 952 mutex_enter(&zp->z_acl_lock);
952 953 if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
953 954 (S_IXUSR >> 6))) != 0 &&
954 955 (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
955 956 secpolicy_vnode_setid_retain(cr,
956 957 (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
957 958 uint64_t newmode;
958 959 zp->z_mode &= ~(S_ISUID | S_ISGID);
959 960 newmode = zp->z_mode;
960 961 (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
961 962 (void *)&newmode, sizeof (uint64_t), tx);
962 963 }
963 964 mutex_exit(&zp->z_acl_lock);
964 965
964 965
965 966 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
966 967 B_TRUE);
967 968
968 969 /*
969 970 * Update the file size (zp_size) if it has changed;
970 971 * account for possible concurrent updates.
971 972 */
972 973 while ((end_size = zp->z_size) < uio->uio_loffset) {
973 974 (void) atomic_cas_64(&zp->z_size, end_size,
974 975 uio->uio_loffset);
975 - ASSERT(error == 0);
976 976 }
977 977 /*
978 978 * If we are replaying and eof is non zero then force
979 979 * the file size to the specified eof. Note, there's no
980 980 * concurrency during replay.
981 981 */
982 982 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
983 983 zp->z_size = zfsvfs->z_replay_eof;
984 984
985 + /*
986 + * Keep track of a possible pre-existing error from a partial
987 + * write via dmu_write_uio_dbuf above.
988 + */
989 + prev_error = error;
985 990 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
986 991
987 992 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
988 993 dmu_tx_commit(tx);
989 994
990 - if (error != 0)
995 + if (prev_error != 0 || error != 0)
991 996 break;
992 997 ASSERT(tx_bytes == nbytes);
993 998 n -= nbytes;
994 999
995 1000 if (!xuio && n > 0)
996 1001 uio_prefaultpages(MIN(n, max_blksz), uio);
997 1002 }
998 1003
999 1004 rangelock_exit(lr);
1000 1005
1001 1006 /*
1002 1007 * If we're in replay mode, or we made no progress, return error.
1003 1008 * Otherwise, it's at least a partial write, so it's successful.
1004 1009 */
1005 1010 if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1006 1011 ZFS_EXIT(zfsvfs);
1007 1012 return (error);
1008 1013 }
1009 1014
1010 1015 if (ioflag & (FSYNC | FDSYNC) ||
1011 1016 zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1012 1017 zil_commit(zilog, zp->z_id);
1013 1018
1014 1019 ZFS_EXIT(zfsvfs);
1015 1020 return (0);
1016 1021 }
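
The hunks above are the substance of OS-4319: dmu_write_uio_dbuf() can fail after copying only part of a chunk (tx_bytes > 0 with error != 0), and the old code then did "error = sa_bulk_update(...)" unconditionally, overwriting the write error with 0 so the loop kept running and the caller reported success. The deleted ASSERT(error == 0) in the z_size update loop falls out of the same observation: reaching that point with a pending error is now legal. Condensed from the diff (identifiers abbreviated, not standalone code):

    /*
     * Before: the chunk's partial-write error is silently lost.
     */
    error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
        uio, nbytes, tx);                               /* e.g. EFAULT */
    /* ... */
    error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); /* 0: lost! */

    /*
     * After: remember the write error before reusing the variable,
     * and stop if either step failed.
     */
    prev_error = error;
    error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
    if (prev_error != 0 || error != 0)
            break;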
1017 1022
1018 1023 /* ARGSUSED */
1019 1024 void
1020 1025 zfs_get_done(zgd_t *zgd, int error)
1021 1026 {
1022 1027 znode_t *zp = zgd->zgd_private;
1023 1028 objset_t *os = zp->z_zfsvfs->z_os;
1024 1029
1025 1030 if (zgd->zgd_db)
1026 1031 dmu_buf_rele(zgd->zgd_db, zgd);
1027 1032
1028 1033 rangelock_exit(zgd->zgd_lr);
1029 1034
1030 1035 /*
1031 1036 * Release the vnode asynchronously as we currently have the
1032 1037 * txg stopped from syncing.
1033 1038 */
1034 1039 VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1035 1040
1036 1041 kmem_free(zgd, sizeof (zgd_t));
1037 1042 }
1038 1043
1039 1044 #ifdef DEBUG
1040 1045 static int zil_fault_io = 0;
1041 1046 #endif
1042 1047
1043 1048 /*
1044 1049 * Get data to generate a TX_WRITE intent log record.
1045 1050 */
1046 1051 int
1047 1052 zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio)
1048 1053 {
1049 1054 zfsvfs_t *zfsvfs = arg;
1050 1055 objset_t *os = zfsvfs->z_os;
1051 1056 znode_t *zp;
1052 1057 uint64_t object = lr->lr_foid;
1053 1058 uint64_t offset = lr->lr_offset;
1054 1059 uint64_t size = lr->lr_length;
1055 1060 dmu_buf_t *db;
1056 1061 zgd_t *zgd;
1057 1062 int error = 0;
1058 1063
1059 1064 ASSERT3P(lwb, !=, NULL);
1060 1065 ASSERT3P(zio, !=, NULL);
1061 1066 ASSERT3U(size, !=, 0);
1062 1067
1063 1068 /*
1064 1069 * Nothing to do if the file has been removed
1065 1070 */
1066 1071 if (zfs_zget(zfsvfs, object, &zp) != 0)
1067 1072 return (SET_ERROR(ENOENT));
1068 1073 if (zp->z_unlinked) {
1069 1074 /*
1070 1075 * Release the vnode asynchronously as we currently have the
1071 1076 * txg stopped from syncing.
1072 1077 */
1073 1078 VN_RELE_ASYNC(ZTOV(zp),
1074 1079 dsl_pool_vnrele_taskq(dmu_objset_pool(os)));
1075 1080 return (SET_ERROR(ENOENT));
1076 1081 }
1077 1082
1078 1083 zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
1079 1084 zgd->zgd_lwb = lwb;
1080 1085 zgd->zgd_private = zp;
1081 1086
1082 1087 /*
1083 1088 * Write records come in two flavors: immediate and indirect.
1084 1089 * For small writes it's cheaper to store the data with the
1085 1090 * log record (immediate); for large writes it's cheaper to
1086 1091 * sync the data and get a pointer to it (indirect) so that
1087 1092 * we don't have to write the data twice.
1088 1093 */
1089 1094 if (buf != NULL) { /* immediate write */
1090 1095 zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
1091 1096 offset, size, RL_READER);
1092 1097 /* test for truncation needs to be done while range locked */
1093 1098 if (offset >= zp->z_size) {
1094 1099 error = SET_ERROR(ENOENT);
1095 1100 } else {
1096 1101 error = dmu_read(os, object, offset, size, buf,
1097 1102 DMU_READ_NO_PREFETCH);
1098 1103 }
1099 1104 ASSERT(error == 0 || error == ENOENT);
1100 1105 } else { /* indirect write */
1101 1106 /*
1102 1107 * Have to lock the whole block to ensure when it's
1103 1108 * written out and its checksum is being calculated
1104 1109 * that no one can change the data. We need to re-check
1105 1110 * blocksize after we get the lock in case it's changed!
1106 1111 */
1107 1112 for (;;) {
1108 1113 uint64_t blkoff;
1109 1114 size = zp->z_blksz;
1110 1115 blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
1111 1116 offset -= blkoff;
1112 1117 zgd->zgd_lr = rangelock_enter(&zp->z_rangelock,
1113 1118 offset, size, RL_READER);
1114 1119 if (zp->z_blksz == size)
1115 1120 break;
1116 1121 offset += blkoff;
1117 1122 rangelock_exit(zgd->zgd_lr);
1118 1123 }
1119 1124 /* test for truncation needs to be done while range locked */
1120 1125 if (lr->lr_offset >= zp->z_size)
1121 1126 error = SET_ERROR(ENOENT);
1122 1127 #ifdef DEBUG
1123 1128 if (zil_fault_io) {
1124 1129 error = SET_ERROR(EIO);
1125 1130 zil_fault_io = 0;
1126 1131 }
1127 1132 #endif
1128 1133 if (error == 0)
1129 1134 error = dmu_buf_hold(os, object, offset, zgd, &db,
1130 1135 DMU_READ_NO_PREFETCH);
1131 1136
1132 1137 if (error == 0) {
1133 1138 blkptr_t *bp = &lr->lr_blkptr;
1134 1139
1135 1140 zgd->zgd_db = db;
1136 1141 zgd->zgd_bp = bp;
1137 1142
1138 1143 ASSERT(db->db_offset == offset);
1139 1144 ASSERT(db->db_size == size);
1140 1145
1141 1146 error = dmu_sync(zio, lr->lr_common.lrc_txg,
1142 1147 zfs_get_done, zgd);
1143 1148 ASSERT(error || lr->lr_length <= size);
1144 1149
1145 1150 /*
1146 1151 * On success, we need to wait for the write I/O
1147 1152 * initiated by dmu_sync() to complete before we can
1148 1153 * release this dbuf. We will finish everything up
1149 1154 * in the zfs_get_done() callback.
1150 1155 */
1151 1156 if (error == 0)
1152 1157 return (0);
1153 1158
1154 1159 if (error == EALREADY) {
1155 1160 lr->lr_common.lrc_txtype = TX_WRITE2;
1156 1161 /*
1157 1162 * TX_WRITE2 relies on the data previously
1158 1163 * written by the TX_WRITE that caused
1159 1164 * EALREADY. We zero out the BP because
1160 1165 * it is the old, currently-on-disk BP.
1161 1166 */
1162 1167 zgd->zgd_bp = NULL;
1163 1168 BP_ZERO(bp);
1164 1169 error = 0;
1165 1170 }
1166 1171 }
1167 1172 }
1168 1173
1169 1174 zfs_get_done(zgd, error);
1170 1175
1171 1176 return (error);
1172 1177 }
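
zfs_get_data() serves both flavors described in the comment above: for an immediate write the caller passes a buffer and the data is copied into the log record itself; for an indirect write buf is NULL and dmu_sync() writes the block in place so the record can just point at it. The choice is made earlier, when the itx is built in zfs_log_write(), driven by write size and the immediate-write/logbias tuning. A hypothetical condensation of the trade-off, for orientation only (these names are not the real WR_* constants):

    #include <sys/types.h>

    typedef enum { WR_IMMEDIATE_SKETCH, WR_INDIRECT_SKETCH } wr_flavor_t;

    /*
     * Not the real policy -- just its shape: small records are
     * cheaper to copy into the log; large ones are cheaper to sync
     * in place and reference, avoiding writing the data twice.
     */
    static wr_flavor_t
    wr_flavor(uint64_t size, uint64_t immediate_max)
    {
            return (size <= immediate_max ?
                WR_IMMEDIATE_SKETCH : WR_INDIRECT_SKETCH);
    }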
1173 1178
1174 1179 /*ARGSUSED*/
1175 1180 static int
1176 1181 zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
1177 1182 caller_context_t *ct)
1178 1183 {
1179 1184 znode_t *zp = VTOZ(vp);
1180 1185 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
1181 1186 int error;
1182 1187
1183 1188 ZFS_ENTER(zfsvfs);
1184 1189 ZFS_VERIFY_ZP(zp);
1185 1190
1186 1191 if (flag & V_ACE_MASK)
1187 1192 error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
1188 1193 else
1189 1194 error = zfs_zaccess_rwx(zp, mode, flag, cr);
1190 1195
1191 1196 ZFS_EXIT(zfsvfs);
1192 1197 return (error);
1193 1198 }
1194 1199
1195 1200 /*
1196 1201 * If vnode is for a device return a specfs vnode instead.
1197 1202 */
1198 1203 static int
1199 1204 specvp_check(vnode_t **vpp, cred_t *cr)
1200 1205 {
1201 1206 int error = 0;
1202 1207
1203 1208 if (IS_DEVVP(*vpp)) {
1204 1209 struct vnode *svp;
1205 1210
1206 1211 svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
1207 1212 VN_RELE(*vpp);
1208 1213 if (svp == NULL)
1209 1214 error = SET_ERROR(ENOSYS);
1210 1215 *vpp = svp;
1211 1216 }
1212 1217 return (error);
1213 1218 }
1214 1219
1215 1220
1216 1221 /*
1217 1222 * Lookup an entry in a directory, or an extended attribute directory.
1218 1223 * If it exists, return a held vnode reference for it.
1219 1224 *
1220 1225 * IN: dvp - vnode of directory to search.
1221 1226 * nm - name of entry to lookup.
1222 1227 * pnp - full pathname to lookup [UNUSED].
1223 1228 * flags - LOOKUP_XATTR set if looking for an attribute.
1224 1229 * rdir - root directory vnode [UNUSED].
1225 1230 * cr - credentials of caller.
1226 1231 * ct - caller context
1227 1232 * direntflags - directory lookup flags
1228 1233 * realpnp - returned pathname.
1229 1234 *
1230 1235 * OUT: vpp - vnode of located entry, NULL if not found.
1231 1236 *
1232 1237 * RETURN: 0 on success, error code on failure.
1233 1238 *
1234 1239 * Timestamps:
1235 1240 * NA
1236 1241 */
1237 1242 /* ARGSUSED */
1238 1243 static int
1239 1244 zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp,
1240 1245 int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct,
1241 1246 int *direntflags, pathname_t *realpnp)
1242 1247 {
1243 1248 znode_t *zdp = VTOZ(dvp);
1244 1249 zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
1245 1250 int error = 0;
1246 1251
1247 1252 /*
1248 1253 * Fast path lookup, however we must skip DNLC lookup
1249 1254 * for case folding or normalizing lookups because the
1250 1255 * DNLC code only stores the passed in name. This means
1251 1256 * creating 'a' and removing 'A' on a case insensitive
1252 1257 * file system would work, but DNLC still thinks 'a'
1253 1258 * exists and won't let you create it again on the next
1254 1259 * pass through fast path.
1255 1260 */
1256 1261 if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) {
1257 1262
1258 1263 if (dvp->v_type != VDIR) {
1259 1264 return (SET_ERROR(ENOTDIR));
1260 1265 } else if (zdp->z_sa_hdl == NULL) {
1261 1266 return (SET_ERROR(EIO));
1262 1267 }
1263 1268
1264 1269 if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) {
1265 1270 error = zfs_fastaccesschk_execute(zdp, cr);
1266 1271 if (!error) {
1267 1272 *vpp = dvp;
1268 1273 VN_HOLD(*vpp);
1269 1274 return (0);
1270 1275 }
1271 1276 return (error);
1272 1277 } else if (!zdp->z_zfsvfs->z_norm &&
1273 1278 (zdp->z_zfsvfs->z_case == ZFS_CASE_SENSITIVE)) {
1274 1279
1275 1280 vnode_t *tvp = dnlc_lookup(dvp, nm);
1276 1281
1277 1282 if (tvp) {
1278 1283 error = zfs_fastaccesschk_execute(zdp, cr);
1279 1284 if (error) {
1280 1285 VN_RELE(tvp);
1281 1286 return (error);
1282 1287 }
1283 1288 if (tvp == DNLC_NO_VNODE) {
1284 1289 VN_RELE(tvp);
1285 1290 return (SET_ERROR(ENOENT));
1286 1291 } else {
1287 1292 *vpp = tvp;
1288 1293 return (specvp_check(vpp, cr));
1289 1294 }
1290 1295 }
1291 1296 }
1292 1297 }
1293 1298
1294 1299 DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
1295 1300
1296 1301 ZFS_ENTER(zfsvfs);
1297 1302 ZFS_VERIFY_ZP(zdp);
1298 1303
1299 1304 *vpp = NULL;
1300 1305
1301 1306 if (flags & LOOKUP_XATTR) {
1302 1307 /*
1303 1308 * If the xattr property is off, refuse the lookup request.
1304 1309 */
1305 1310 if (!(zfsvfs->z_vfs->vfs_flag & VFS_XATTR)) {
1306 1311 ZFS_EXIT(zfsvfs);
1307 1312 return (SET_ERROR(EINVAL));
1308 1313 }
1309 1314
1310 1315 /*
1311 1316                * We don't allow recursive attributes.
1312 1317 * Maybe someday we will.
1313 1318 */
1314 1319 if (zdp->z_pflags & ZFS_XATTR) {
1315 1320 ZFS_EXIT(zfsvfs);
1316 1321 return (SET_ERROR(EINVAL));
1317 1322 }
1318 1323
1319 1324 if (error = zfs_get_xattrdir(VTOZ(dvp), vpp, cr, flags)) {
1320 1325 ZFS_EXIT(zfsvfs);
1321 1326 return (error);
1322 1327 }
1323 1328
1324 1329 /*
1325 1330 * Do we have permission to get into attribute directory?
1326 1331 */
1327 1332
1328 1333 if (error = zfs_zaccess(VTOZ(*vpp), ACE_EXECUTE, 0,
1329 1334 B_FALSE, cr)) {
1330 1335 VN_RELE(*vpp);
1331 1336 *vpp = NULL;
1332 1337 }
1333 1338
1334 1339 ZFS_EXIT(zfsvfs);
1335 1340 return (error);
1336 1341 }
1337 1342
1338 1343 if (dvp->v_type != VDIR) {
1339 1344 ZFS_EXIT(zfsvfs);
1340 1345 return (SET_ERROR(ENOTDIR));
1341 1346 }
1342 1347
1343 1348 /*
1344 1349 * Check accessibility of directory.
1345 1350 */
1346 1351
1347 1352 if (error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr)) {
1348 1353 ZFS_EXIT(zfsvfs);
1349 1354 return (error);
1350 1355 }
1351 1356
1352 1357 if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
1353 1358 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1354 1359 ZFS_EXIT(zfsvfs);
1355 1360 return (SET_ERROR(EILSEQ));
1356 1361 }
1357 1362
1358 1363 error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp);
1359 1364 if (error == 0)
1360 1365 error = specvp_check(vpp, cr);
1361 1366
1362 1367 ZFS_EXIT(zfsvfs);
1363 1368 return (error);
1364 1369 }
1365 1370
1366 1371 /*
1367 1372 * Attempt to create a new entry in a directory. If the entry
1368 1373 * already exists, truncate the file if permissible, else return
1369 1374 * an error. Return the vp of the created or trunc'd file.
1370 1375 *
1371 1376 * IN: dvp - vnode of directory to put new file entry in.
1372 1377 * name - name of new file entry.
1373 1378 * vap - attributes of new file.
1374 1379 * excl - flag indicating exclusive or non-exclusive mode.
1375 1380 * mode - mode to open file with.
1376 1381 * cr - credentials of caller.
1377 1382 * flag - large file flag [UNUSED].
1378 1383 * ct - caller context
1379 1384 * vsecp - ACL to be set
1380 1385 *
1381 1386 * OUT: vpp - vnode of created or trunc'd entry.
1382 1387 *
1383 1388 * RETURN: 0 on success, error code on failure.
1384 1389 *
1385 1390 * Timestamps:
1386 1391 * dvp - ctime|mtime updated if new entry created
1387 1392 * vp - ctime|mtime always, atime if new
1388 1393 */
1389 1394
1390 1395 /* ARGSUSED */
1391 1396 static int
1392 1397 zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl,
1393 1398 int mode, vnode_t **vpp, cred_t *cr, int flag, caller_context_t *ct,
1394 1399 vsecattr_t *vsecp)
1395 1400 {
1396 1401 znode_t *zp, *dzp = VTOZ(dvp);
1397 1402 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1398 1403 zilog_t *zilog;
1399 1404 objset_t *os;
1400 1405 zfs_dirlock_t *dl;
1401 1406 dmu_tx_t *tx;
1402 1407 int error;
1403 1408 ksid_t *ksid;
1404 1409 uid_t uid;
1405 1410 gid_t gid = crgetgid(cr);
1406 1411 zfs_acl_ids_t acl_ids;
1407 1412 boolean_t fuid_dirtied;
1408 1413 boolean_t have_acl = B_FALSE;
1409 1414 boolean_t waited = B_FALSE;
1410 1415
1411 1416 /*
1412 1417 * If we have an ephemeral id, ACL, or XVATTR then
1413 1418 * make sure file system is at proper version
1414 1419 */
1415 1420
1416 1421 ksid = crgetsid(cr, KSID_OWNER);
1417 1422 if (ksid)
1418 1423 uid = ksid_getid(ksid);
1419 1424 else
1420 1425 uid = crgetuid(cr);
1421 1426
1422 1427 if (zfsvfs->z_use_fuids == B_FALSE &&
1423 1428 (vsecp || (vap->va_mask & AT_XVATTR) ||
1424 1429 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1425 1430 return (SET_ERROR(EINVAL));
1426 1431
1427 1432 ZFS_ENTER(zfsvfs);
1428 1433 ZFS_VERIFY_ZP(dzp);
1429 1434 os = zfsvfs->z_os;
1430 1435 zilog = zfsvfs->z_log;
1431 1436
1432 1437 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
1433 1438 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1434 1439 ZFS_EXIT(zfsvfs);
1435 1440 return (SET_ERROR(EILSEQ));
1436 1441 }
1437 1442
1438 1443 if (vap->va_mask & AT_XVATTR) {
1439 1444 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1440 1445 crgetuid(cr), cr, vap->va_type)) != 0) {
1441 1446 ZFS_EXIT(zfsvfs);
1442 1447 return (error);
1443 1448 }
1444 1449 }
1445 1450 top:
1446 1451 *vpp = NULL;
1447 1452
1448 1453 if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr))
1449 1454 vap->va_mode &= ~VSVTX;
1450 1455
1451 1456 if (*name == '\0') {
1452 1457 /*
1453 1458 * Null component name refers to the directory itself.
1454 1459 */
1455 1460 VN_HOLD(dvp);
1456 1461 zp = dzp;
1457 1462 dl = NULL;
1458 1463 error = 0;
1459 1464 } else {
1460 1465 /* possible VN_HOLD(zp) */
1461 1466 int zflg = 0;
1462 1467
1463 1468 if (flag & FIGNORECASE)
1464 1469 zflg |= ZCILOOK;
1465 1470
1466 1471 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1467 1472 NULL, NULL);
1468 1473 if (error) {
1469 1474 if (have_acl)
1470 1475 zfs_acl_ids_free(&acl_ids);
1471 1476 if (strcmp(name, "..") == 0)
1472 1477 error = SET_ERROR(EISDIR);
1473 1478 ZFS_EXIT(zfsvfs);
1474 1479 return (error);
1475 1480 }
1476 1481 }
1477 1482
1478 1483 if (zp == NULL) {
1479 1484 uint64_t txtype;
1480 1485
1481 1486 /*
1482 1487 * Create a new file object and update the directory
1483 1488 * to reference it.
1484 1489 */
1485 1490 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
1486 1491 if (have_acl)
1487 1492 zfs_acl_ids_free(&acl_ids);
1488 1493 goto out;
1489 1494 }
1490 1495
1491 1496 /*
1492 1497 * We only support the creation of regular files in
1493 1498 * extended attribute directories.
1494 1499 */
1495 1500
1496 1501 if ((dzp->z_pflags & ZFS_XATTR) &&
1497 1502 (vap->va_type != VREG)) {
1498 1503 if (have_acl)
1499 1504 zfs_acl_ids_free(&acl_ids);
1500 1505 error = SET_ERROR(EINVAL);
1501 1506 goto out;
1502 1507 }
1503 1508
1504 1509 if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap,
1505 1510 cr, vsecp, &acl_ids)) != 0)
1506 1511 goto out;
1507 1512 have_acl = B_TRUE;
1508 1513
1509 1514 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1510 1515 zfs_acl_ids_free(&acl_ids);
1511 1516 error = SET_ERROR(EDQUOT);
1512 1517 goto out;
1513 1518 }
1514 1519
1515 1520 tx = dmu_tx_create(os);
1516 1521
1517 1522 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1518 1523 ZFS_SA_BASE_ATTR_SIZE);
1519 1524
1520 1525 fuid_dirtied = zfsvfs->z_fuid_dirty;
1521 1526 if (fuid_dirtied)
1522 1527 zfs_fuid_txhold(zfsvfs, tx);
1523 1528 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
1524 1529 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
1525 1530 if (!zfsvfs->z_use_sa &&
1526 1531 acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1527 1532 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
1528 1533 0, acl_ids.z_aclp->z_acl_bytes);
1529 1534 }
1530 1535 error = dmu_tx_assign(tx,
1531 1536 (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1532 1537 if (error) {
1533 1538 zfs_dirent_unlock(dl);
1534 1539 if (error == ERESTART) {
1535 1540 waited = B_TRUE;
1536 1541 dmu_tx_wait(tx);
1537 1542 dmu_tx_abort(tx);
1538 1543 goto top;
1539 1544 }
1540 1545 zfs_acl_ids_free(&acl_ids);
1541 1546 dmu_tx_abort(tx);
1542 1547 ZFS_EXIT(zfsvfs);
1543 1548 return (error);
1544 1549 }
1545 1550 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
1546 1551
1547 1552 if (fuid_dirtied)
1548 1553 zfs_fuid_sync(zfsvfs, tx);
1549 1554
1550 1555 (void) zfs_link_create(dl, zp, tx, ZNEW);
1551 1556 txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap);
1552 1557 if (flag & FIGNORECASE)
1553 1558 txtype |= TX_CI;
1554 1559 zfs_log_create(zilog, tx, txtype, dzp, zp, name,
1555 1560 vsecp, acl_ids.z_fuidp, vap);
1556 1561 zfs_acl_ids_free(&acl_ids);
1557 1562 dmu_tx_commit(tx);
1558 1563 } else {
1559 1564 int aflags = (flag & FAPPEND) ? V_APPEND : 0;
1560 1565
1561 1566 if (have_acl)
1562 1567 zfs_acl_ids_free(&acl_ids);
1563 1568 have_acl = B_FALSE;
1564 1569
1565 1570 /*
1566 1571 * A directory entry already exists for this name.
1567 1572 */
1568 1573 /*
1569 1574 * Can't truncate an existing file if in exclusive mode.
1570 1575 */
1571 1576 if (excl == EXCL) {
1572 1577 error = SET_ERROR(EEXIST);
1573 1578 goto out;
1574 1579 }
1575 1580 /*
1576 1581 * Can't open a directory for writing.
1577 1582 */
1578 1583 if ((ZTOV(zp)->v_type == VDIR) && (mode & S_IWRITE)) {
1579 1584 error = SET_ERROR(EISDIR);
1580 1585 goto out;
1581 1586 }
1582 1587 /*
1583 1588 * Verify requested access to file.
1584 1589 */
1585 1590 if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) {
1586 1591 goto out;
1587 1592 }
1588 1593
1589 1594 mutex_enter(&dzp->z_lock);
1590 1595 dzp->z_seq++;
1591 1596 mutex_exit(&dzp->z_lock);
1592 1597
1593 1598 /*
1594 1599 * Truncate regular files if requested.
1595 1600 */
1596 1601 if ((ZTOV(zp)->v_type == VREG) &&
1597 1602 (vap->va_mask & AT_SIZE) && (vap->va_size == 0)) {
1598 1603 /* we can't hold any locks when calling zfs_freesp() */
1599 1604 zfs_dirent_unlock(dl);
1600 1605 dl = NULL;
1601 1606 error = zfs_freesp(zp, 0, 0, mode, TRUE);
1602 1607 if (error == 0) {
1603 1608 vnevent_create(ZTOV(zp), ct);
1604 1609 }
1605 1610 }
1606 1611 }
1607 1612 out:
1608 1613
1609 1614 if (dl)
1610 1615 zfs_dirent_unlock(dl);
1611 1616
1612 1617 if (error) {
1613 1618 if (zp)
1614 1619 VN_RELE(ZTOV(zp));
1615 1620 } else {
1616 1621 *vpp = ZTOV(zp);
1617 1622 error = specvp_check(vpp, cr);
1618 1623 }
1619 1624
1620 1625 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1621 1626 zil_commit(zilog, 0);
1622 1627
1623 1628 ZFS_EXIT(zfsvfs);
1624 1629 return (error);
1625 1630 }
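
zfs_create() backs open(2) with O_CREAT, and the branches above map directly onto familiar flag combinations. Illustrative only:

    #include <fcntl.h>

    int
    main(void)
    {
            /* (excl == EXCL) branch: EEXIST if "f" already exists. */
            int fd1 = open("f", O_CREAT | O_EXCL | O_WRONLY, 0644);

            /*
             * Existing regular file with va_size == 0: truncated via
             * zfs_freesp(), which is called with no locks held.
             */
            int fd2 = open("f", O_CREAT | O_TRUNC | O_WRONLY, 0644);

            return (fd1 < 0 && fd2 < 0);
    }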
1626 1631
1627 1632 /*
1628 1633 * Remove an entry from a directory.
1629 1634 *
1630 1635 * IN: dvp - vnode of directory to remove entry from.
1631 1636 * name - name of entry to remove.
1632 1637 * cr - credentials of caller.
1633 1638 * ct - caller context
1634 1639 * flags - case flags
1635 1640 *
1636 1641 * RETURN: 0 on success, error code on failure.
1637 1642 *
1638 1643 * Timestamps:
1639 1644 * dvp - ctime|mtime
1640 1645 * vp - ctime (if nlink > 0)
1641 1646 */
1642 1647
1643 1648 uint64_t null_xattr = 0;
1644 1649
1645 1650 /*ARGSUSED*/
1646 1651 static int
1647 1652 zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct,
1648 1653 int flags)
1649 1654 {
1650 1655 znode_t *zp, *dzp = VTOZ(dvp);
1651 1656 znode_t *xzp;
1652 1657 vnode_t *vp;
1653 1658 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1654 1659 zilog_t *zilog;
1655 1660 uint64_t acl_obj, xattr_obj;
1656 1661 uint64_t xattr_obj_unlinked = 0;
1657 1662 uint64_t obj = 0;
1658 1663 zfs_dirlock_t *dl;
1659 1664 dmu_tx_t *tx;
1660 1665 boolean_t may_delete_now, delete_now = FALSE;
1661 1666 boolean_t unlinked, toobig = FALSE;
1662 1667 uint64_t txtype;
1663 1668 pathname_t *realnmp = NULL;
1664 1669 pathname_t realnm;
1665 1670 int error;
1666 1671 int zflg = ZEXISTS;
1667 1672 boolean_t waited = B_FALSE;
1668 1673
1669 1674 ZFS_ENTER(zfsvfs);
1670 1675 ZFS_VERIFY_ZP(dzp);
1671 1676 zilog = zfsvfs->z_log;
1672 1677
1673 1678 if (flags & FIGNORECASE) {
1674 1679 zflg |= ZCILOOK;
1675 1680 pn_alloc(&realnm);
1676 1681 realnmp = &realnm;
1677 1682 }
1678 1683
1679 1684 top:
1680 1685 xattr_obj = 0;
1681 1686 xzp = NULL;
1682 1687 /*
1683 1688 * Attempt to lock directory; fail if entry doesn't exist.
1684 1689 */
1685 1690 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
1686 1691 NULL, realnmp)) {
1687 1692 if (realnmp)
1688 1693 pn_free(realnmp);
1689 1694 ZFS_EXIT(zfsvfs);
1690 1695 return (error);
1691 1696 }
1692 1697
1693 1698 vp = ZTOV(zp);
1694 1699
1695 1700 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
1696 1701 goto out;
1697 1702 }
1698 1703
1699 1704 /*
1700 1705 * Need to use rmdir for removing directories.
1701 1706 */
1702 1707 if (vp->v_type == VDIR) {
1703 1708 error = SET_ERROR(EPERM);
1704 1709 goto out;
1705 1710 }
1706 1711
1707 1712 vnevent_remove(vp, dvp, name, ct);
1708 1713
1709 1714 if (realnmp)
1710 1715 dnlc_remove(dvp, realnmp->pn_buf);
1711 1716 else
1712 1717 dnlc_remove(dvp, name);
1713 1718
1714 1719 mutex_enter(&vp->v_lock);
1715 1720 may_delete_now = vp->v_count == 1 && !vn_has_cached_data(vp);
1716 1721 mutex_exit(&vp->v_lock);
1717 1722
1718 1723 /*
1719 1724 * We may delete the znode now, or we may put it in the unlinked set;
1720 1725 * it depends on whether we're the last link, and on whether there are
1721 1726 * other holds on the vnode. So we dmu_tx_hold() the right things to
1722 1727 * allow for either case.
1723 1728 */
1724 1729 obj = zp->z_id;
1725 1730 tx = dmu_tx_create(zfsvfs->z_os);
1726 1731 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
1727 1732 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
1728 1733 zfs_sa_upgrade_txholds(tx, zp);
1729 1734 zfs_sa_upgrade_txholds(tx, dzp);
1730 1735 if (may_delete_now) {
1731 1736 toobig =
1732 1737 zp->z_size > zp->z_blksz * DMU_MAX_DELETEBLKCNT;
1733 1738 /* if the file is too big, only hold_free a token amount */
1734 1739 dmu_tx_hold_free(tx, zp->z_id, 0,
1735 1740 (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END));
1736 1741 }
1737 1742
1738 1743 /* are there any extended attributes? */
1739 1744 error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1740 1745 &xattr_obj, sizeof (xattr_obj));
1741 1746 if (error == 0 && xattr_obj) {
1742 1747 error = zfs_zget(zfsvfs, xattr_obj, &xzp);
1743 1748 ASSERT0(error);
1744 1749 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
1745 1750 dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
1746 1751 }
1747 1752
1748 1753 mutex_enter(&zp->z_lock);
1749 1754 if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now)
1750 1755 dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
1751 1756 mutex_exit(&zp->z_lock);
1752 1757
1753 1758 /* charge as an update -- would be nice not to charge at all */
1754 1759 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
1755 1760
1756 1761 /*
1757 1762 * Mark this transaction as typically resulting in a net free of space
1758 1763 */
1759 1764 dmu_tx_mark_netfree(tx);
1760 1765
1761 1766 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1762 1767 if (error) {
1763 1768 zfs_dirent_unlock(dl);
1764 1769 VN_RELE(vp);
1765 1770 if (xzp)
1766 1771 VN_RELE(ZTOV(xzp));
1767 1772 if (error == ERESTART) {
1768 1773 waited = B_TRUE;
1769 1774 dmu_tx_wait(tx);
1770 1775 dmu_tx_abort(tx);
1771 1776 goto top;
1772 1777 }
1773 1778 if (realnmp)
1774 1779 pn_free(realnmp);
1775 1780 dmu_tx_abort(tx);
1776 1781 ZFS_EXIT(zfsvfs);
1777 1782 return (error);
1778 1783 }
1779 1784
1780 1785 /*
1781 1786 * Remove the directory entry.
1782 1787 */
1783 1788 error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked);
1784 1789
1785 1790 if (error) {
1786 1791 dmu_tx_commit(tx);
1787 1792 goto out;
1788 1793 }
1789 1794
1790 1795 if (unlinked) {
1791 1796 /*
1792 1797 * Hold z_lock so that we can make sure that the ACL obj
1793 1798 	 * hasn't changed; it could have been deleted due to
1794 1799 * zfs_sa_upgrade().
1795 1800 */
1796 1801 mutex_enter(&zp->z_lock);
1797 1802 mutex_enter(&vp->v_lock);
1798 1803 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
1799 1804 &xattr_obj_unlinked, sizeof (xattr_obj_unlinked));
1800 1805 delete_now = may_delete_now && !toobig &&
1801 1806 vp->v_count == 1 && !vn_has_cached_data(vp) &&
1802 1807 xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) ==
1803 1808 acl_obj;
1804 1809 mutex_exit(&vp->v_lock);
1805 1810 }
1806 1811
1807 1812 if (delete_now) {
1808 1813 if (xattr_obj_unlinked) {
1809 1814 ASSERT3U(xzp->z_links, ==, 2);
1810 1815 mutex_enter(&xzp->z_lock);
1811 1816 xzp->z_unlinked = 1;
1812 1817 xzp->z_links = 0;
1813 1818 error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
1814 1819 &xzp->z_links, sizeof (xzp->z_links), tx);
1815 1820 ASSERT3U(error, ==, 0);
1816 1821 mutex_exit(&xzp->z_lock);
1817 1822 zfs_unlinked_add(xzp, tx);
1818 1823
1819 1824 if (zp->z_is_sa)
1820 1825 error = sa_remove(zp->z_sa_hdl,
1821 1826 SA_ZPL_XATTR(zfsvfs), tx);
1822 1827 else
1823 1828 error = sa_update(zp->z_sa_hdl,
1824 1829 SA_ZPL_XATTR(zfsvfs), &null_xattr,
1825 1830 sizeof (uint64_t), tx);
1826 1831 ASSERT0(error);
1827 1832 }
1828 1833 mutex_enter(&vp->v_lock);
1829 1834 VN_RELE_LOCKED(vp);
1830 1835 ASSERT0(vp->v_count);
1831 1836 mutex_exit(&vp->v_lock);
1832 1837 mutex_exit(&zp->z_lock);
1833 1838 zfs_znode_delete(zp, tx);
1834 1839 } else if (unlinked) {
1835 1840 mutex_exit(&zp->z_lock);
1836 1841 zfs_unlinked_add(zp, tx);
1837 1842 }
1838 1843
1839 1844 txtype = TX_REMOVE;
1840 1845 if (flags & FIGNORECASE)
1841 1846 txtype |= TX_CI;
1842 1847 zfs_log_remove(zilog, tx, txtype, dzp, name, obj);
1843 1848
1844 1849 dmu_tx_commit(tx);
1845 1850 out:
1846 1851 if (realnmp)
1847 1852 pn_free(realnmp);
1848 1853
1849 1854 zfs_dirent_unlock(dl);
1850 1855
1851 1856 if (!delete_now)
1852 1857 VN_RELE(vp);
1853 1858 if (xzp)
1854 1859 VN_RELE(ZTOV(xzp));
1855 1860
1856 1861 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
1857 1862 zil_commit(zilog, 0);
1858 1863
1859 1864 ZFS_EXIT(zfsvfs);
1860 1865 return (error);
1861 1866 }
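
/*
 * Review sketch, not part of this change: the "delete now" decision
 * above restated as a predicate. The helper name is hypothetical; the
 * conditions are exactly those evaluated under z_lock/v_lock in
 * zfs_remove(). The znode is destroyed in this transaction only when
 * all of them hold; otherwise it is parked in the unlinked set and
 * reaped later.
 */
static boolean_t
zfs_delete_now_ok(vnode_t *vp, znode_t *zp, boolean_t may_delete_now,
    boolean_t toobig, uint64_t xattr_obj, uint64_t xattr_obj_unlinked,
    uint64_t acl_obj)
{
	/* Caller holds zp->z_lock and vp->v_lock, as zfs_remove() does. */
	return (may_delete_now && !toobig &&
	    vp->v_count == 1 && !vn_has_cached_data(vp) &&
	    xattr_obj == xattr_obj_unlinked &&
	    zfs_external_acl(zp) == acl_obj);
}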
1862 1867
1863 1868 /*
1864 1869 * Create a new directory and insert it into dvp using the name
1865 1870 * provided. Return a pointer to the inserted directory.
1866 1871 *
1867 1872 * IN: dvp - vnode of directory to add subdir to.
1868 1873 * dirname - name of new directory.
1869 1874 * vap - attributes of new directory.
1870 1875 * cr - credentials of caller.
1871 1876 * ct - caller context
1872 1877 * flags - case flags
1873 1878 * vsecp - ACL to be set
1874 1879 *
1875 1880 * OUT: vpp - vnode of created directory.
1876 1881 *
1877 1882 * RETURN: 0 on success, error code on failure.
1878 1883 *
1879 1884 * Timestamps:
1880 1885 * dvp - ctime|mtime updated
1881 1886 * vp - ctime|mtime|atime updated
1882 1887 */
1883 1888 /*ARGSUSED*/
1884 1889 static int
1885 1890 zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr,
1886 1891 caller_context_t *ct, int flags, vsecattr_t *vsecp)
1887 1892 {
1888 1893 znode_t *zp, *dzp = VTOZ(dvp);
1889 1894 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
1890 1895 zilog_t *zilog;
1891 1896 zfs_dirlock_t *dl;
1892 1897 uint64_t txtype;
1893 1898 dmu_tx_t *tx;
1894 1899 int error;
1895 1900 int zf = ZNEW;
1896 1901 ksid_t *ksid;
1897 1902 uid_t uid;
1898 1903 gid_t gid = crgetgid(cr);
1899 1904 zfs_acl_ids_t acl_ids;
1900 1905 boolean_t fuid_dirtied;
1901 1906 boolean_t waited = B_FALSE;
1902 1907
1903 1908 ASSERT(vap->va_type == VDIR);
1904 1909
1905 1910 /*
1906 1911 * If we have an ephemeral id, ACL, or XVATTR then
1906 1911 	 * make sure the file system is at the proper version
1908 1913 */
1909 1914
1910 1915 ksid = crgetsid(cr, KSID_OWNER);
1911 1916 if (ksid)
1912 1917 uid = ksid_getid(ksid);
1913 1918 else
1914 1919 uid = crgetuid(cr);
1915 1920 if (zfsvfs->z_use_fuids == B_FALSE &&
1916 1921 (vsecp || (vap->va_mask & AT_XVATTR) ||
1917 1922 IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid)))
1918 1923 return (SET_ERROR(EINVAL));
1919 1924
1920 1925 ZFS_ENTER(zfsvfs);
1921 1926 ZFS_VERIFY_ZP(dzp);
1922 1927 zilog = zfsvfs->z_log;
1923 1928
1924 1929 if (dzp->z_pflags & ZFS_XATTR) {
1925 1930 ZFS_EXIT(zfsvfs);
1926 1931 return (SET_ERROR(EINVAL));
1927 1932 }
1928 1933
1929 1934 if (zfsvfs->z_utf8 && u8_validate(dirname,
1930 1935 strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
1931 1936 ZFS_EXIT(zfsvfs);
1932 1937 return (SET_ERROR(EILSEQ));
1933 1938 }
1934 1939 if (flags & FIGNORECASE)
1935 1940 zf |= ZCILOOK;
1936 1941
1937 1942 if (vap->va_mask & AT_XVATTR) {
1938 1943 if ((error = secpolicy_xvattr((xvattr_t *)vap,
1939 1944 crgetuid(cr), cr, vap->va_type)) != 0) {
1940 1945 ZFS_EXIT(zfsvfs);
1941 1946 return (error);
1942 1947 }
1943 1948 }
1944 1949
1945 1950 if ((error = zfs_acl_ids_create(dzp, 0, vap, cr,
1946 1951 vsecp, &acl_ids)) != 0) {
1947 1952 ZFS_EXIT(zfsvfs);
1948 1953 return (error);
1949 1954 }
1950 1955 /*
1951 1956 * First make sure the new directory doesn't exist.
1952 1957 *
1953 1958 * Existence is checked first to make sure we don't return
1954 1959 	 * EACCES instead of EEXIST, which can cause some applications
1955 1960 * to fail.
1956 1961 */
1957 1962 top:
1958 1963 *vpp = NULL;
1959 1964
1960 1965 if (error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf,
1961 1966 NULL, NULL)) {
1962 1967 zfs_acl_ids_free(&acl_ids);
1963 1968 ZFS_EXIT(zfsvfs);
1964 1969 return (error);
1965 1970 }
1966 1971
1967 1972 if (error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr)) {
1968 1973 zfs_acl_ids_free(&acl_ids);
1969 1974 zfs_dirent_unlock(dl);
1970 1975 ZFS_EXIT(zfsvfs);
1971 1976 return (error);
1972 1977 }
1973 1978
1974 1979 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
1975 1980 zfs_acl_ids_free(&acl_ids);
1976 1981 zfs_dirent_unlock(dl);
1977 1982 ZFS_EXIT(zfsvfs);
1978 1983 return (SET_ERROR(EDQUOT));
1979 1984 }
1980 1985
1981 1986 /*
1982 1987 * Add a new entry to the directory.
1983 1988 */
1984 1989 tx = dmu_tx_create(zfsvfs->z_os);
1985 1990 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname);
1986 1991 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
1987 1992 fuid_dirtied = zfsvfs->z_fuid_dirty;
1988 1993 if (fuid_dirtied)
1989 1994 zfs_fuid_txhold(zfsvfs, tx);
1990 1995 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
1991 1996 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
1992 1997 acl_ids.z_aclp->z_acl_bytes);
1993 1998 }
1994 1999
1995 2000 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
1996 2001 ZFS_SA_BASE_ATTR_SIZE);
1997 2002
1998 2003 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
1999 2004 if (error) {
2000 2005 zfs_dirent_unlock(dl);
2001 2006 if (error == ERESTART) {
2002 2007 waited = B_TRUE;
2003 2008 dmu_tx_wait(tx);
2004 2009 dmu_tx_abort(tx);
2005 2010 goto top;
2006 2011 }
2007 2012 zfs_acl_ids_free(&acl_ids);
2008 2013 dmu_tx_abort(tx);
2009 2014 ZFS_EXIT(zfsvfs);
2010 2015 return (error);
2011 2016 }
2012 2017
2013 2018 /*
2014 2019 * Create new node.
2015 2020 */
2016 2021 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
2017 2022
2018 2023 if (fuid_dirtied)
2019 2024 zfs_fuid_sync(zfsvfs, tx);
2020 2025
2021 2026 /*
2022 2027 * Now put new name in parent dir.
2023 2028 */
2024 2029 (void) zfs_link_create(dl, zp, tx, ZNEW);
2025 2030
2026 2031 *vpp = ZTOV(zp);
2027 2032
2028 2033 txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap);
2029 2034 if (flags & FIGNORECASE)
2030 2035 txtype |= TX_CI;
2031 2036 zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp,
2032 2037 acl_ids.z_fuidp, vap);
2033 2038
2034 2039 zfs_acl_ids_free(&acl_ids);
2035 2040
2036 2041 dmu_tx_commit(tx);
2037 2042
2038 2043 zfs_dirent_unlock(dl);
2039 2044
2040 2045 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2041 2046 zil_commit(zilog, 0);
2042 2047
2043 2048 ZFS_EXIT(zfsvfs);
2044 2049 return (0);
2045 2050 }
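
/*
 * Review sketch, not part of this change: the dmu_tx_assign() restart
 * idiom shared by zfs_remove(), zfs_mkdir(), and zfs_rmdir() above,
 * distilled ("..." marks elided caller-specific work). On ERESTART the
 * caller drops its locks, waits for the write throttle to clear,
 * aborts, and retries; TXG_NOTHROTTLE on the second pass keeps the
 * retry from being throttled a second time.
 *
 *	waited = B_FALSE;
 * top:
 *	tx = dmu_tx_create(os);
 *	... dmu_tx_hold_*() the objects this op will dirty ...
 *	error = dmu_tx_assign(tx,
 *	    (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
 *	if (error) {
 *		... drop locks and holds taken since "top" ...
 *		if (error == ERESTART) {
 *			waited = B_TRUE;
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		}
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 */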
2046 2051
2047 2052 /*
2048 2053 * Remove a directory subdir entry. If the current working
2049 2054 * directory is the same as the subdir to be removed, the
2050 2055 * remove will fail.
2051 2056 *
2052 2057 * IN: dvp - vnode of directory to remove from.
2053 2058 * name - name of directory to be removed.
2054 2059 * cwd - vnode of current working directory.
2055 2060 * cr - credentials of caller.
2056 2061 * ct - caller context
2057 2062 * flags - case flags
2058 2063 *
2059 2064 * RETURN: 0 on success, error code on failure.
2060 2065 *
2061 2066 * Timestamps:
2062 2067 * dvp - ctime|mtime updated
2063 2068 */
2064 2069 /*ARGSUSED*/
2065 2070 static int
2066 2071 zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr,
2067 2072 caller_context_t *ct, int flags)
2068 2073 {
2069 2074 znode_t *dzp = VTOZ(dvp);
2070 2075 znode_t *zp;
2071 2076 vnode_t *vp;
2072 2077 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
2073 2078 zilog_t *zilog;
2074 2079 zfs_dirlock_t *dl;
2075 2080 dmu_tx_t *tx;
2076 2081 int error;
2077 2082 int zflg = ZEXISTS;
2078 2083 boolean_t waited = B_FALSE;
2079 2084
2080 2085 ZFS_ENTER(zfsvfs);
2081 2086 ZFS_VERIFY_ZP(dzp);
2082 2087 zilog = zfsvfs->z_log;
2083 2088
2084 2089 if (flags & FIGNORECASE)
2085 2090 zflg |= ZCILOOK;
2086 2091 top:
2087 2092 zp = NULL;
2088 2093
2089 2094 /*
2090 2095 * Attempt to lock directory; fail if entry doesn't exist.
2091 2096 */
2092 2097 if (error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg,
2093 2098 NULL, NULL)) {
2094 2099 ZFS_EXIT(zfsvfs);
2095 2100 return (error);
2096 2101 }
2097 2102
2098 2103 vp = ZTOV(zp);
2099 2104
2100 2105 if (error = zfs_zaccess_delete(dzp, zp, cr)) {
2101 2106 goto out;
2102 2107 }
2103 2108
2104 2109 if (vp->v_type != VDIR) {
2105 2110 error = SET_ERROR(ENOTDIR);
2106 2111 goto out;
2107 2112 }
2108 2113
2109 2114 if (vp == cwd) {
2110 2115 error = SET_ERROR(EINVAL);
2111 2116 goto out;
2112 2117 }
2113 2118
2114 2119 vnevent_rmdir(vp, dvp, name, ct);
2115 2120
2116 2121 /*
2117 2122 	 * Grab a lock on the directory to make sure that no one is
2118 2123 	 * trying to add (or look up) entries while we are removing it.
2119 2124 */
2120 2125 rw_enter(&zp->z_name_lock, RW_WRITER);
2121 2126
2122 2127 /*
2123 2128 * Grab a lock on the parent pointer to make sure we play well
2124 2129 * with the treewalk and directory rename code.
2125 2130 */
2126 2131 rw_enter(&zp->z_parent_lock, RW_WRITER);
2127 2132
2128 2133 tx = dmu_tx_create(zfsvfs->z_os);
2129 2134 dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name);
2130 2135 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
2131 2136 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
2132 2137 zfs_sa_upgrade_txholds(tx, zp);
2133 2138 zfs_sa_upgrade_txholds(tx, dzp);
2134 2139 dmu_tx_mark_netfree(tx);
2135 2140 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
2136 2141 if (error) {
2137 2142 rw_exit(&zp->z_parent_lock);
2138 2143 rw_exit(&zp->z_name_lock);
2139 2144 zfs_dirent_unlock(dl);
2140 2145 VN_RELE(vp);
2141 2146 if (error == ERESTART) {
2142 2147 waited = B_TRUE;
2143 2148 dmu_tx_wait(tx);
2144 2149 dmu_tx_abort(tx);
2145 2150 goto top;
2146 2151 }
2147 2152 dmu_tx_abort(tx);
2148 2153 ZFS_EXIT(zfsvfs);
2149 2154 return (error);
2150 2155 }
2151 2156
2152 2157 error = zfs_link_destroy(dl, zp, tx, zflg, NULL);
2153 2158
2154 2159 if (error == 0) {
2155 2160 uint64_t txtype = TX_RMDIR;
2156 2161 if (flags & FIGNORECASE)
2157 2162 txtype |= TX_CI;
2158 2163 zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT);
2159 2164 }
2160 2165
2161 2166 dmu_tx_commit(tx);
2162 2167
2163 2168 rw_exit(&zp->z_parent_lock);
2164 2169 rw_exit(&zp->z_name_lock);
2165 2170 out:
2166 2171 zfs_dirent_unlock(dl);
2167 2172
2168 2173 VN_RELE(vp);
2169 2174
2170 2175 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
2171 2176 zil_commit(zilog, 0);
2172 2177
2173 2178 ZFS_EXIT(zfsvfs);
2174 2179 return (error);
2175 2180 }
2176 2181
2177 2182 /*
2178 2183 * Read as many directory entries as will fit into the provided
2179 2184 * buffer from the given directory cursor position (specified in
2180 2185 * the uio structure).
2181 2186 *
2182 2187 * IN: vp - vnode of directory to read.
2183 2188 * uio - structure supplying read location, range info,
2184 2189 * and return buffer.
2185 2190 * cr - credentials of caller.
2186 2191 * ct - caller context
2187 2192 * flags - case flags
2188 2193 *
2189 2194 * OUT: uio - updated offset and range, buffer filled.
2190 2195 * eofp - set to true if end-of-file detected.
2191 2196 *
2192 2197 * RETURN: 0 on success, error code on failure.
2193 2198 *
2194 2199 * Timestamps:
2195 2200 * vp - atime updated
2196 2201 *
2197 2202 	 * Note that the low 4 bits of the cookie returned by zap are always zero.
2198 2203 * This allows us to use the low range for "special" directory entries:
2199 2204 * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem,
2200 2205 * we use the offset 2 for the '.zfs' directory.
2201 2206 */
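/*
 * Review note, not part of this change: an illustration of the cookie
 * space described above.
 *
 *	cookie 0	"." (synthesized)
 *	cookie 1	".." (synthesized)
 *	cookie 2	".zfs" (synthesized; root only, when visible)
 *	cookie > 3	a zap_cursor_serialize() value; real ZAP cookies
 *			have the low 4 bits clear, so they cannot
 *			collide with the synthetic entries above.
 *
 * This is why the code below starts a fresh cursor when offset <= 3
 * and reconstructs a serialized cursor otherwise.
 */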
2202 2207 /* ARGSUSED */
2203 2208 static int
2204 2209 zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp,
2205 2210 caller_context_t *ct, int flags)
2206 2211 {
2207 2212 znode_t *zp = VTOZ(vp);
2208 2213 iovec_t *iovp;
2209 2214 edirent_t *eodp;
2210 2215 dirent64_t *odp;
2211 2216 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2212 2217 objset_t *os;
2213 2218 caddr_t outbuf;
2214 2219 size_t bufsize;
2215 2220 zap_cursor_t zc;
2216 2221 zap_attribute_t zap;
2217 2222 uint_t bytes_wanted;
2218 2223 uint64_t offset; /* must be unsigned; checks for < 1 */
2219 2224 uint64_t parent;
2220 2225 int local_eof;
2221 2226 int outcount;
2222 2227 int error;
2223 2228 uint8_t prefetch;
2224 2229 boolean_t check_sysattrs;
2225 2230
2226 2231 ZFS_ENTER(zfsvfs);
2227 2232 ZFS_VERIFY_ZP(zp);
2228 2233
2229 2234 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
2230 2235 &parent, sizeof (parent))) != 0) {
2231 2236 ZFS_EXIT(zfsvfs);
2232 2237 return (error);
2233 2238 }
2234 2239
2235 2240 /*
2236 2241 * If we are not given an eof variable,
2237 2242 * use a local one.
2238 2243 */
2239 2244 if (eofp == NULL)
2240 2245 eofp = &local_eof;
2241 2246
2242 2247 /*
2243 2248 * Check for valid iov_len.
2244 2249 */
2245 2250 if (uio->uio_iov->iov_len <= 0) {
2246 2251 ZFS_EXIT(zfsvfs);
2247 2252 return (SET_ERROR(EINVAL));
2248 2253 }
2249 2254
2250 2255 /*
2251 2256 	 * Quit if the directory has been removed (POSIX)
2252 2257 */
2253 2258 if ((*eofp = zp->z_unlinked) != 0) {
2254 2259 ZFS_EXIT(zfsvfs);
2255 2260 return (0);
2256 2261 }
2257 2262
2258 2263 error = 0;
2259 2264 os = zfsvfs->z_os;
2260 2265 offset = uio->uio_loffset;
2261 2266 prefetch = zp->z_zn_prefetch;
2262 2267
2263 2268 /*
2264 2269 * Initialize the iterator cursor.
2265 2270 */
2266 2271 if (offset <= 3) {
2267 2272 /*
2268 2273 * Start iteration from the beginning of the directory.
2269 2274 */
2270 2275 zap_cursor_init(&zc, os, zp->z_id);
2271 2276 } else {
2272 2277 /*
2273 2278 * The offset is a serialized cursor.
2274 2279 */
2275 2280 zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
2276 2281 }
2277 2282
2278 2283 /*
2279 2284 * Get space to change directory entries into fs independent format.
2280 2285 */
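/*
 * Review note, not part of this change: when the destination is user
 * memory, or spans more than one iovec, the entries are staged in a
 * kernel bounce buffer and uiomove()d out in one shot at the end; a
 * single UIO_SYSSPACE iovec is filled in place and the uio is
 * adjusted by hand.
 */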
2281 2286 iovp = uio->uio_iov;
2282 2287 bytes_wanted = iovp->iov_len;
2283 2288 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) {
2284 2289 bufsize = bytes_wanted;
2285 2290 outbuf = kmem_alloc(bufsize, KM_SLEEP);
2286 2291 odp = (struct dirent64 *)outbuf;
2287 2292 } else {
2288 2293 bufsize = bytes_wanted;
2289 2294 outbuf = NULL;
2290 2295 odp = (struct dirent64 *)iovp->iov_base;
2291 2296 }
2292 2297 eodp = (struct edirent *)odp;
2293 2298
2294 2299 /*
2295 2300 	 * If this VFS supports the system attribute view interface, and
2296 2301 	 * we're looking at an extended attribute directory, and we care
2297 2302 	 * about normalization conflicts on this vfs, then we must check
2298 2303 * for normalization conflicts with the sysattr name space.
2299 2304 */
2300 2305 check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
2301 2306 (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm &&
2302 2307 (flags & V_RDDIR_ENTFLAGS);
2303 2308
2304 2309 /*
2305 2310 * Transform to file-system independent format
2306 2311 */
2307 2312 outcount = 0;
2308 2313 while (outcount < bytes_wanted) {
2309 2314 ino64_t objnum;
2310 2315 ushort_t reclen;
2311 2316 off64_t *next = NULL;
2312 2317
2313 2318 /*
2314 2319 * Special case `.', `..', and `.zfs'.
2315 2320 */
2316 2321 if (offset == 0) {
2317 2322 (void) strcpy(zap.za_name, ".");
2318 2323 zap.za_normalization_conflict = 0;
2319 2324 objnum = zp->z_id;
2320 2325 } else if (offset == 1) {
2321 2326 (void) strcpy(zap.za_name, "..");
2322 2327 zap.za_normalization_conflict = 0;
2323 2328 objnum = parent;
2324 2329 } else if (offset == 2 && zfs_show_ctldir(zp)) {
2325 2330 (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME);
2326 2331 zap.za_normalization_conflict = 0;
2327 2332 objnum = ZFSCTL_INO_ROOT;
2328 2333 } else {
2329 2334 /*
2330 2335 * Grab next entry.
2331 2336 */
2332 2337 if (error = zap_cursor_retrieve(&zc, &zap)) {
2333 2338 if ((*eofp = (error == ENOENT)) != 0)
2334 2339 break;
2335 2340 else
2336 2341 goto update;
2337 2342 }
2338 2343
2339 2344 if (zap.za_integer_length != 8 ||
2340 2345 zap.za_num_integers != 1) {
2341 2346 cmn_err(CE_WARN, "zap_readdir: bad directory "
2342 2347 "entry, obj = %lld, offset = %lld\n",
2343 2348 (u_longlong_t)zp->z_id,
2344 2349 (u_longlong_t)offset);
2345 2350 error = SET_ERROR(ENXIO);
2346 2351 goto update;
2347 2352 }
2348 2353
2349 2354 objnum = ZFS_DIRENT_OBJ(zap.za_first_integer);
2350 2355 /*
2351 2356 	 * Mac OS X can extract the object type here, e.g.:
2352 2357 * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer);
2353 2358 */
2354 2359
2355 2360 if (check_sysattrs && !zap.za_normalization_conflict) {
2356 2361 zap.za_normalization_conflict =
2357 2362 xattr_sysattr_casechk(zap.za_name);
2358 2363 }
2359 2364 }
2360 2365
2361 2366 if (flags & V_RDDIR_ACCFILTER) {
2362 2367 /*
2363 2368 * If we have no access at all, don't include
2364 2369 * this entry in the returned information
2365 2370 */
2366 2371 znode_t *ezp;
2367 2372 if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0)
2368 2373 goto skip_entry;
2369 2374 if (!zfs_has_access(ezp, cr)) {
2370 2375 VN_RELE(ZTOV(ezp));
2371 2376 goto skip_entry;
2372 2377 }
2373 2378 VN_RELE(ZTOV(ezp));
2374 2379 }
2375 2380
2376 2381 if (flags & V_RDDIR_ENTFLAGS)
2377 2382 reclen = EDIRENT_RECLEN(strlen(zap.za_name));
2378 2383 else
2379 2384 reclen = DIRENT64_RECLEN(strlen(zap.za_name));
2380 2385
2381 2386 /*
2382 2387 * Will this entry fit in the buffer?
2383 2388 */
2384 2389 if (outcount + reclen > bufsize) {
2385 2390 /*
2386 2391 * Did we manage to fit anything in the buffer?
2387 2392 */
2388 2393 if (!outcount) {
2389 2394 error = SET_ERROR(EINVAL);
2390 2395 goto update;
2391 2396 }
2392 2397 break;
2393 2398 }
2394 2399 if (flags & V_RDDIR_ENTFLAGS) {
2395 2400 /*
2396 2401 * Add extended flag entry:
2397 2402 */
2398 2403 eodp->ed_ino = objnum;
2399 2404 eodp->ed_reclen = reclen;
2400 2405 /* NOTE: ed_off is the offset for the *next* entry */
2401 2406 next = &(eodp->ed_off);
2402 2407 eodp->ed_eflags = zap.za_normalization_conflict ?
2403 2408 ED_CASE_CONFLICT : 0;
2404 2409 (void) strncpy(eodp->ed_name, zap.za_name,
2405 2410 EDIRENT_NAMELEN(reclen));
2406 2411 eodp = (edirent_t *)((intptr_t)eodp + reclen);
2407 2412 } else {
2408 2413 /*
2409 2414 * Add normal entry:
2410 2415 */
2411 2416 odp->d_ino = objnum;
2412 2417 odp->d_reclen = reclen;
2413 2418 /* NOTE: d_off is the offset for the *next* entry */
2414 2419 next = &(odp->d_off);
2415 2420 (void) strncpy(odp->d_name, zap.za_name,
2416 2421 DIRENT64_NAMELEN(reclen));
2417 2422 odp = (dirent64_t *)((intptr_t)odp + reclen);
2418 2423 }
2419 2424 outcount += reclen;
2420 2425
2421 2426 ASSERT(outcount <= bufsize);
2422 2427
2423 2428 /* Prefetch znode */
2424 2429 if (prefetch)
2425 2430 dmu_prefetch(os, objnum, 0, 0, 0,
2426 2431 ZIO_PRIORITY_SYNC_READ);
2427 2432
2428 2433 skip_entry:
2429 2434 /*
2430 2435 * Move to the next entry, fill in the previous offset.
2431 2436 */
2432 2437 if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) {
2433 2438 zap_cursor_advance(&zc);
2434 2439 offset = zap_cursor_serialize(&zc);
2435 2440 } else {
2436 2441 offset += 1;
2437 2442 }
2438 2443 if (next)
2439 2444 *next = offset;
2440 2445 }
2441 2446 zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */
2442 2447
2443 2448 if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) {
2444 2449 iovp->iov_base += outcount;
2445 2450 iovp->iov_len -= outcount;
2446 2451 uio->uio_resid -= outcount;
2447 2452 } else if (error = uiomove(outbuf, (long)outcount, UIO_READ, uio)) {
2448 2453 /*
2449 2454 * Reset the pointer.
2450 2455 */
2451 2456 offset = uio->uio_loffset;
2452 2457 }
2453 2458
2454 2459 update:
2455 2460 zap_cursor_fini(&zc);
2456 2461 if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1)
2457 2462 kmem_free(outbuf, bufsize);
2458 2463
2459 2464 if (error == ENOENT)
2460 2465 error = 0;
2461 2466
2462 2467 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
2463 2468
2464 2469 uio->uio_loffset = offset;
2465 2470 ZFS_EXIT(zfsvfs);
2466 2471 return (error);
2467 2472 }
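
/*
 * Review sketch, not part of this change: how an in-kernel caller
 * might drive the entry point above through the VOP layer. The
 * function name is hypothetical and error handling is trimmed.
 * Entries are walked by d_reclen; d_off in each entry is the cookie
 * of the *next* entry, as noted in zfs_readdir() above.
 */
static int
zfs_readdir_walk_demo(vnode_t *dvp, cred_t *cr)
{
	char buf[1024];
	iovec_t iov;
	uio_t uio;
	dirent64_t *dp;
	size_t len;
	int eof = 0, error;

	bzero(&uio, sizeof (uio));
	uio.uio_iovcnt = 1;
	uio.uio_segflg = UIO_SYSSPACE;
	uio.uio_loffset = 0;

	do {
		iov.iov_base = buf;
		iov.iov_len = sizeof (buf);
		uio.uio_iov = &iov;
		uio.uio_resid = sizeof (buf);

		error = VOP_READDIR(dvp, &uio, cr, &eof, NULL, 0);
		if (error != 0)
			break;

		len = sizeof (buf) - uio.uio_resid;
		for (dp = (dirent64_t *)(uintptr_t)buf;
		    (char *)dp < buf + len;
		    dp = (dirent64_t *)((uintptr_t)dp + dp->d_reclen)) {
			/* consume dp->d_name and dp->d_ino here */
		}
	} while (!eof && len != 0);

	return (error);
}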
2468 2473
2469 2474 ulong_t zfs_fsync_sync_cnt = 4;
2470 2475
2471 2476 static int
2472 2477 zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct)
2473 2478 {
2474 2479 znode_t *zp = VTOZ(vp);
2475 2480 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2476 2481
2477 2482 /*
2478 2483 * Regardless of whether this is required for standards conformance,
2479 2484 * this is the logical behavior when fsync() is called on a file with
2480 2485 * dirty pages. We use B_ASYNC since the ZIL transactions are already
2481 2486 * going to be pushed out as part of the zil_commit().
2482 2487 */
2483 2488 if (vn_has_cached_data(vp) && !(syncflag & FNODSYNC) &&
2484 2489 (vp->v_type == VREG) && !(IS_SWAPVP(vp)))
2485 2490 (void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_ASYNC, cr, ct);
2486 2491
2487 2492 (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt);
2488 2493
2489 2494 if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) {
2490 2495 ZFS_ENTER(zfsvfs);
2491 2496 ZFS_VERIFY_ZP(zp);
2492 2497 zil_commit(zfsvfs->z_log, zp->z_id);
2493 2498 ZFS_EXIT(zfsvfs);
2494 2499 }
2495 2500 return (0);
2496 2501 }
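
/*
 * Review note, not part of this change: a typical in-kernel call into
 * the entry point above is
 *
 *	error = VOP_FSYNC(vp, FSYNC, cr, NULL);
 *
 * With os_sync == ZFS_SYNC_DISABLED the zil_commit() is skipped
 * entirely; with ZFS_SYNC_ALWAYS the data was already committed
 * synchronously at write time, so the commit here is cheap.
 */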
2497 2502
2498 2503
2499 2504 /*
2500 2505 * Get the requested file attributes and place them in the provided
2501 2506 * vattr structure.
2502 2507 *
2503 2508 * IN: vp - vnode of file.
2504 2509 * vap - va_mask identifies requested attributes.
2505 2510 * If AT_XVATTR set, then optional attrs are requested
2506 2511 * flags - ATTR_NOACLCHECK (CIFS server context)
2507 2512 * cr - credentials of caller.
2508 2513 * ct - caller context
2509 2514 *
2510 2515 * OUT: vap - attribute values.
2511 2516 *
2512 2517 * RETURN: 0 (always succeeds).
2513 2518 */
2514 2519 /* ARGSUSED */
2515 2520 static int
2516 2521 zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2517 2522 caller_context_t *ct)
2518 2523 {
2519 2524 znode_t *zp = VTOZ(vp);
2520 2525 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2521 2526 int error = 0;
2522 2527 uint64_t links;
2523 2528 uint64_t mtime[2], ctime[2];
2524 2529 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2525 2530 xoptattr_t *xoap = NULL;
2526 2531 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2527 2532 sa_bulk_attr_t bulk[2];
2528 2533 int count = 0;
2529 2534
2530 2535 ZFS_ENTER(zfsvfs);
2531 2536 ZFS_VERIFY_ZP(zp);
2532 2537
2533 2538 zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid);
2534 2539
2535 2540 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
2536 2541 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
2537 2542
2538 2543 if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) {
2539 2544 ZFS_EXIT(zfsvfs);
2540 2545 return (error);
2541 2546 }
2542 2547
2543 2548 /*
2544 2549 * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES.
2545 2550 * Also, if we are the owner don't bother, since owner should
2546 2551 * always be allowed to read basic attributes of file.
2547 2552 */
2548 2553 if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) &&
2549 2554 (vap->va_uid != crgetuid(cr))) {
2550 2555 if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0,
2551 2556 skipaclchk, cr)) {
2552 2557 ZFS_EXIT(zfsvfs);
2553 2558 return (error);
2554 2559 }
2555 2560 }
2556 2561
2557 2562 /*
2558 2563 * Return all attributes. It's cheaper to provide the answer
2559 2564 * than to determine whether we were asked the question.
2560 2565 */
2561 2566
2562 2567 mutex_enter(&zp->z_lock);
2563 2568 vap->va_type = vp->v_type;
2564 2569 vap->va_mode = zp->z_mode & MODEMASK;
2565 2570 vap->va_fsid = zp->z_zfsvfs->z_vfs->vfs_dev;
2566 2571 vap->va_nodeid = zp->z_id;
2567 2572 if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp))
2568 2573 links = zp->z_links + 1;
2569 2574 else
2570 2575 links = zp->z_links;
2571 2576 vap->va_nlink = MIN(links, UINT32_MAX); /* nlink_t limit! */
2572 2577 vap->va_size = zp->z_size;
2573 2578 vap->va_rdev = vp->v_rdev;
2574 2579 vap->va_seq = zp->z_seq;
2575 2580
2576 2581 /*
2577 2582 * Add in any requested optional attributes and the create time.
2578 2583 * Also set the corresponding bits in the returned attribute bitmap.
2579 2584 */
2580 2585 if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) {
2581 2586 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
2582 2587 xoap->xoa_archive =
2583 2588 ((zp->z_pflags & ZFS_ARCHIVE) != 0);
2584 2589 XVA_SET_RTN(xvap, XAT_ARCHIVE);
2585 2590 }
2586 2591
2587 2592 if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
2588 2593 xoap->xoa_readonly =
2589 2594 ((zp->z_pflags & ZFS_READONLY) != 0);
2590 2595 XVA_SET_RTN(xvap, XAT_READONLY);
2591 2596 }
2592 2597
2593 2598 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
2594 2599 xoap->xoa_system =
2595 2600 ((zp->z_pflags & ZFS_SYSTEM) != 0);
2596 2601 XVA_SET_RTN(xvap, XAT_SYSTEM);
2597 2602 }
2598 2603
2599 2604 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
2600 2605 xoap->xoa_hidden =
2601 2606 ((zp->z_pflags & ZFS_HIDDEN) != 0);
2602 2607 XVA_SET_RTN(xvap, XAT_HIDDEN);
2603 2608 }
2604 2609
2605 2610 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2606 2611 xoap->xoa_nounlink =
2607 2612 ((zp->z_pflags & ZFS_NOUNLINK) != 0);
2608 2613 XVA_SET_RTN(xvap, XAT_NOUNLINK);
2609 2614 }
2610 2615
2611 2616 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2612 2617 xoap->xoa_immutable =
2613 2618 ((zp->z_pflags & ZFS_IMMUTABLE) != 0);
2614 2619 XVA_SET_RTN(xvap, XAT_IMMUTABLE);
2615 2620 }
2616 2621
2617 2622 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2618 2623 xoap->xoa_appendonly =
2619 2624 ((zp->z_pflags & ZFS_APPENDONLY) != 0);
2620 2625 XVA_SET_RTN(xvap, XAT_APPENDONLY);
2621 2626 }
2622 2627
2623 2628 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2624 2629 xoap->xoa_nodump =
2625 2630 ((zp->z_pflags & ZFS_NODUMP) != 0);
2626 2631 XVA_SET_RTN(xvap, XAT_NODUMP);
2627 2632 }
2628 2633
2629 2634 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
2630 2635 xoap->xoa_opaque =
2631 2636 ((zp->z_pflags & ZFS_OPAQUE) != 0);
2632 2637 XVA_SET_RTN(xvap, XAT_OPAQUE);
2633 2638 }
2634 2639
2635 2640 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2636 2641 xoap->xoa_av_quarantined =
2637 2642 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0);
2638 2643 XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
2639 2644 }
2640 2645
2641 2646 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2642 2647 xoap->xoa_av_modified =
2643 2648 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0);
2644 2649 XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
2645 2650 }
2646 2651
2647 2652 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) &&
2648 2653 vp->v_type == VREG) {
2649 2654 zfs_sa_get_scanstamp(zp, xvap);
2650 2655 }
2651 2656
2652 2657 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
2653 2658 uint64_t times[2];
2654 2659
2655 2660 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs),
2656 2661 times, sizeof (times));
2657 2662 ZFS_TIME_DECODE(&xoap->xoa_createtime, times);
2658 2663 XVA_SET_RTN(xvap, XAT_CREATETIME);
2659 2664 }
2660 2665
2661 2666 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2662 2667 xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0);
2663 2668 XVA_SET_RTN(xvap, XAT_REPARSE);
2664 2669 }
2665 2670 if (XVA_ISSET_REQ(xvap, XAT_GEN)) {
2666 2671 xoap->xoa_generation = zp->z_gen;
2667 2672 XVA_SET_RTN(xvap, XAT_GEN);
2668 2673 }
2669 2674
2670 2675 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
2671 2676 xoap->xoa_offline =
2672 2677 ((zp->z_pflags & ZFS_OFFLINE) != 0);
2673 2678 XVA_SET_RTN(xvap, XAT_OFFLINE);
2674 2679 }
2675 2680
2676 2681 if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
2677 2682 xoap->xoa_sparse =
2678 2683 ((zp->z_pflags & ZFS_SPARSE) != 0);
2679 2684 XVA_SET_RTN(xvap, XAT_SPARSE);
2680 2685 }
2681 2686 }
2682 2687
2683 2688 ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime);
2684 2689 ZFS_TIME_DECODE(&vap->va_mtime, mtime);
2685 2690 ZFS_TIME_DECODE(&vap->va_ctime, ctime);
2686 2691
2687 2692 mutex_exit(&zp->z_lock);
2688 2693
2689 2694 sa_object_size(zp->z_sa_hdl, &vap->va_blksize, &vap->va_nblocks);
2690 2695
2691 2696 if (zp->z_blksz == 0) {
2692 2697 /*
2693 2698 * Block size hasn't been set; suggest maximal I/O transfers.
2694 2699 */
2695 2700 vap->va_blksize = zfsvfs->z_max_blksz;
2696 2701 }
2697 2702
2698 2703 ZFS_EXIT(zfsvfs);
2699 2704 return (0);
2700 2705 }
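
/*
 * Review sketch, not part of this change: requesting one of the
 * optional attributes handled above through the xvattr interface.
 * The function name is hypothetical; error handling is trimmed.
 */
static boolean_t
zfs_immutable_demo(vnode_t *vp, cred_t *cr)
{
	xvattr_t xva;
	xoptattr_t *xoap;

	xva_init(&xva);			/* sets AT_XVATTR in va_mask */
	xoap = xva_getxoptattr(&xva);
	ASSERT(xoap != NULL);
	XVA_SET_REQ(&xva, XAT_IMMUTABLE);

	if (VOP_GETATTR(vp, &xva.xva_vattr, 0, cr, NULL) != 0)
		return (B_FALSE);

	/* zfs_getattr() sets the RTN bit only if it filled the field */
	return (XVA_ISSET_RTN(&xva, XAT_IMMUTABLE) &&
	    xoap->xoa_immutable);
}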
2701 2706
2702 2707 /*
2703 2708 * Set the file attributes to the values contained in the
2704 2709 * vattr structure.
2705 2710 *
2706 2711 * IN: vp - vnode of file to be modified.
2707 2712 * vap - new attribute values.
2708 2713 * If AT_XVATTR set, then optional attrs are being set
2709 2714 * flags - ATTR_UTIME set if non-default time values provided.
2710 2715 * - ATTR_NOACLCHECK (CIFS context only).
2711 2716 * cr - credentials of caller.
2712 2717 * ct - caller context
2713 2718 *
2714 2719 * RETURN: 0 on success, error code on failure.
2715 2720 *
2716 2721 * Timestamps:
2717 2722 * vp - ctime updated, mtime updated if size changed.
2718 2723 */
2719 2724 /* ARGSUSED */
2720 2725 static int
2721 2726 zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr,
2722 2727 caller_context_t *ct)
2723 2728 {
2724 2729 znode_t *zp = VTOZ(vp);
2725 2730 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
2726 2731 zilog_t *zilog;
2727 2732 dmu_tx_t *tx;
2728 2733 vattr_t oldva;
2729 2734 xvattr_t tmpxvattr;
2730 2735 uint_t mask = vap->va_mask;
2731 2736 uint_t saved_mask = 0;
2732 2737 int trim_mask = 0;
2733 2738 uint64_t new_mode;
2734 2739 uint64_t new_uid, new_gid;
2735 2740 uint64_t xattr_obj;
2736 2741 uint64_t mtime[2], ctime[2];
2737 2742 znode_t *attrzp;
2738 2743 int need_policy = FALSE;
2739 2744 int err, err2;
2740 2745 zfs_fuid_info_t *fuidp = NULL;
2741 2746 xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */
2742 2747 xoptattr_t *xoap;
2743 2748 zfs_acl_t *aclp;
2744 2749 boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
2745 2750 boolean_t fuid_dirtied = B_FALSE;
2746 2751 sa_bulk_attr_t bulk[7], xattr_bulk[7];
2747 2752 int count = 0, xattr_count = 0;
2748 2753
2749 2754 if (mask == 0)
2750 2755 return (0);
2751 2756
2752 2757 if (mask & AT_NOSET)
2753 2758 return (SET_ERROR(EINVAL));
2754 2759
2755 2760 ZFS_ENTER(zfsvfs);
2756 2761 ZFS_VERIFY_ZP(zp);
2757 2762
2758 2763 zilog = zfsvfs->z_log;
2759 2764
2760 2765 /*
2761 2766 	 * Make sure that if we have an ephemeral uid/gid or xvattr specified,
2762 2767 	 * the file system is at the proper version level
2763 2768 */
2764 2769
2765 2770 if (zfsvfs->z_use_fuids == B_FALSE &&
2766 2771 (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) ||
2767 2772 ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) ||
2768 2773 (mask & AT_XVATTR))) {
2769 2774 ZFS_EXIT(zfsvfs);
2770 2775 return (SET_ERROR(EINVAL));
2771 2776 }
2772 2777
2773 2778 if (mask & AT_SIZE && vp->v_type == VDIR) {
2774 2779 ZFS_EXIT(zfsvfs);
2775 2780 return (SET_ERROR(EISDIR));
2776 2781 }
2777 2782
2778 2783 if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) {
2779 2784 ZFS_EXIT(zfsvfs);
2780 2785 return (SET_ERROR(EINVAL));
2781 2786 }
2782 2787
2783 2788 /*
2784 2789 * If this is an xvattr_t, then get a pointer to the structure of
2785 2790 * optional attributes. If this is NULL, then we have a vattr_t.
2786 2791 */
2787 2792 xoap = xva_getxoptattr(xvap);
2788 2793
2789 2794 xva_init(&tmpxvattr);
2790 2795
2791 2796 /*
2792 2797 	 * For immutable files, only the immutable bit and atime may be altered
2793 2798 */
2794 2799 if ((zp->z_pflags & ZFS_IMMUTABLE) &&
2795 2800 ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) ||
2796 2801 ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) {
2797 2802 ZFS_EXIT(zfsvfs);
2798 2803 return (SET_ERROR(EPERM));
2799 2804 }
2800 2805
2801 2806 /*
2802 2807 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
2803 2808 */
2804 2809
2805 2810 /*
2806 2811 	 * Verify the timestamps don't overflow 32 bits.
2807 2812 	 * ZFS can handle large timestamps, but 32-bit syscalls can't
2808 2813 	 * handle times beyond January 2038. This check should be removed
2809 2814 * once large timestamps are fully supported.
2810 2815 */
2811 2816 if (mask & (AT_ATIME | AT_MTIME)) {
2812 2817 if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) ||
2813 2818 ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) {
2814 2819 ZFS_EXIT(zfsvfs);
2815 2820 return (SET_ERROR(EOVERFLOW));
2816 2821 }
2817 2822 }
2818 2823
2819 2824 top:
2820 2825 attrzp = NULL;
2821 2826 aclp = NULL;
2822 2827
2823 2828 /* Can this be moved to before the top label? */
2824 2829 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
2825 2830 ZFS_EXIT(zfsvfs);
2826 2831 return (SET_ERROR(EROFS));
2827 2832 }
2828 2833
2829 2834 /*
2830 2835 * First validate permissions
2831 2836 */
2832 2837
2833 2838 if (mask & AT_SIZE) {
2834 2839 err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr);
2835 2840 if (err) {
2836 2841 ZFS_EXIT(zfsvfs);
2837 2842 return (err);
2838 2843 }
2839 2844 /*
2840 2845 * XXX - Note, we are not providing any open
2841 2846 * mode flags here (like FNDELAY), so we may
2842 2847 * block if there are locks present... this
2843 2848 * should be addressed in openat().
2844 2849 */
2845 2850 /* XXX - would it be OK to generate a log record here? */
2846 2851 err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE);
2847 2852 if (err) {
2848 2853 ZFS_EXIT(zfsvfs);
2849 2854 return (err);
2850 2855 }
2851 2856
2852 2857 if (vap->va_size == 0)
2853 2858 vnevent_truncate(ZTOV(zp), ct);
2854 2859 }
2855 2860
2856 2861 if (mask & (AT_ATIME|AT_MTIME) ||
2857 2862 ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) ||
2858 2863 XVA_ISSET_REQ(xvap, XAT_READONLY) ||
2859 2864 XVA_ISSET_REQ(xvap, XAT_ARCHIVE) ||
2860 2865 XVA_ISSET_REQ(xvap, XAT_OFFLINE) ||
2861 2866 XVA_ISSET_REQ(xvap, XAT_SPARSE) ||
2862 2867 XVA_ISSET_REQ(xvap, XAT_CREATETIME) ||
2863 2868 XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) {
2864 2869 need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0,
2865 2870 skipaclchk, cr);
2866 2871 }
2867 2872
2868 2873 if (mask & (AT_UID|AT_GID)) {
2869 2874 int idmask = (mask & (AT_UID|AT_GID));
2870 2875 int take_owner;
2871 2876 int take_group;
2872 2877
2873 2878 /*
2874 2879 * NOTE: even if a new mode is being set,
2875 2880 * we may clear S_ISUID/S_ISGID bits.
2876 2881 */
2877 2882
2878 2883 if (!(mask & AT_MODE))
2879 2884 vap->va_mode = zp->z_mode;
2880 2885
2881 2886 /*
2882 2887 	 * Take ownership, or chgrp to a group we are a member of
2883 2888 */
2884 2889
2885 2890 take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr));
2886 2891 take_group = (mask & AT_GID) &&
2887 2892 zfs_groupmember(zfsvfs, vap->va_gid, cr);
2888 2893
2889 2894 /*
2890 2895 * If both AT_UID and AT_GID are set then take_owner and
2891 2896 * take_group must both be set in order to allow taking
2892 2897 * ownership.
2893 2898 *
2894 2899 * Otherwise, send the check through secpolicy_vnode_setattr()
2895 2900 *
2896 2901 */
2897 2902
2898 2903 if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) ||
2899 2904 ((idmask == AT_UID) && take_owner) ||
2900 2905 ((idmask == AT_GID) && take_group)) {
2901 2906 if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0,
2902 2907 skipaclchk, cr) == 0) {
2903 2908 /*
2904 2909 * Remove setuid/setgid for non-privileged users
2905 2910 */
2906 2911 secpolicy_setid_clear(vap, cr);
2907 2912 trim_mask = (mask & (AT_UID|AT_GID));
2908 2913 } else {
2909 2914 need_policy = TRUE;
2910 2915 }
2911 2916 } else {
2912 2917 need_policy = TRUE;
2913 2918 }
2914 2919 }
2915 2920
2916 2921 mutex_enter(&zp->z_lock);
2917 2922 oldva.va_mode = zp->z_mode;
2918 2923 zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid);
2919 2924 if (mask & AT_XVATTR) {
2920 2925 /*
2921 2926 * Update xvattr mask to include only those attributes
2922 2927 * that are actually changing.
2923 2928 *
2924 2929 	 * The bits will be restored prior to actually setting
2925 2930 * the attributes so the caller thinks they were set.
2926 2931 */
2927 2932 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
2928 2933 if (xoap->xoa_appendonly !=
2929 2934 ((zp->z_pflags & ZFS_APPENDONLY) != 0)) {
2930 2935 need_policy = TRUE;
2931 2936 } else {
2932 2937 XVA_CLR_REQ(xvap, XAT_APPENDONLY);
2933 2938 XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY);
2934 2939 }
2935 2940 }
2936 2941
2937 2942 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
2938 2943 if (xoap->xoa_nounlink !=
2939 2944 ((zp->z_pflags & ZFS_NOUNLINK) != 0)) {
2940 2945 need_policy = TRUE;
2941 2946 } else {
2942 2947 XVA_CLR_REQ(xvap, XAT_NOUNLINK);
2943 2948 XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK);
2944 2949 }
2945 2950 }
2946 2951
2947 2952 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
2948 2953 if (xoap->xoa_immutable !=
2949 2954 ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) {
2950 2955 need_policy = TRUE;
2951 2956 } else {
2952 2957 XVA_CLR_REQ(xvap, XAT_IMMUTABLE);
2953 2958 XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE);
2954 2959 }
2955 2960 }
2956 2961
2957 2962 if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
2958 2963 if (xoap->xoa_nodump !=
2959 2964 ((zp->z_pflags & ZFS_NODUMP) != 0)) {
2960 2965 need_policy = TRUE;
2961 2966 } else {
2962 2967 XVA_CLR_REQ(xvap, XAT_NODUMP);
2963 2968 XVA_SET_REQ(&tmpxvattr, XAT_NODUMP);
2964 2969 }
2965 2970 }
2966 2971
2967 2972 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
2968 2973 if (xoap->xoa_av_modified !=
2969 2974 ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) {
2970 2975 need_policy = TRUE;
2971 2976 } else {
2972 2977 XVA_CLR_REQ(xvap, XAT_AV_MODIFIED);
2973 2978 XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED);
2974 2979 }
2975 2980 }
2976 2981
2977 2982 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
2978 2983 if ((vp->v_type != VREG &&
2979 2984 xoap->xoa_av_quarantined) ||
2980 2985 xoap->xoa_av_quarantined !=
2981 2986 ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) {
2982 2987 need_policy = TRUE;
2983 2988 } else {
2984 2989 XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED);
2985 2990 XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED);
2986 2991 }
2987 2992 }
2988 2993
2989 2994 if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
2990 2995 mutex_exit(&zp->z_lock);
2991 2996 ZFS_EXIT(zfsvfs);
2992 2997 return (SET_ERROR(EPERM));
2993 2998 }
2994 2999
2995 3000 if (need_policy == FALSE &&
2996 3001 (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) ||
2997 3002 XVA_ISSET_REQ(xvap, XAT_OPAQUE))) {
2998 3003 need_policy = TRUE;
2999 3004 }
3000 3005 }
3001 3006
3002 3007 mutex_exit(&zp->z_lock);
3003 3008
3004 3009 if (mask & AT_MODE) {
3005 3010 if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) {
3006 3011 err = secpolicy_setid_setsticky_clear(vp, vap,
3007 3012 &oldva, cr);
3008 3013 if (err) {
3009 3014 ZFS_EXIT(zfsvfs);
3010 3015 return (err);
3011 3016 }
3012 3017 trim_mask |= AT_MODE;
3013 3018 } else {
3014 3019 need_policy = TRUE;
3015 3020 }
3016 3021 }
3017 3022
3018 3023 if (need_policy) {
3019 3024 /*
3020 3025 	 * If trim_mask is set then take-ownership
3021 3026 	 * has been granted, or write_acl is present and the user
3022 3027 	 * has the ability to modify the mode. In that case remove
3023 3028 	 * UID|GID and/or MODE from the mask so that
3024 3029 	 * secpolicy_vnode_setattr() doesn't revoke it.
3025 3030 */
3026 3031
3027 3032 if (trim_mask) {
3028 3033 saved_mask = vap->va_mask;
3029 3034 vap->va_mask &= ~trim_mask;
3030 3035 }
3031 3036 err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags,
3032 3037 (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
3033 3038 if (err) {
3034 3039 ZFS_EXIT(zfsvfs);
3035 3040 return (err);
3036 3041 }
3037 3042
3038 3043 if (trim_mask)
3039 3044 vap->va_mask |= saved_mask;
3040 3045 }
3041 3046
3042 3047 /*
3043 3048 	 * secpolicy_vnode_setattr() or the take-ownership check may have
3044 3049 	 * changed va_mask.
3045 3050 */
3046 3051 mask = vap->va_mask;
3047 3052
3048 3053 if ((mask & (AT_UID | AT_GID))) {
3049 3054 err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
3050 3055 &xattr_obj, sizeof (xattr_obj));
3051 3056
3052 3057 if (err == 0 && xattr_obj) {
3053 3058 err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp);
3054 3059 if (err)
3055 3060 goto out2;
3056 3061 }
3057 3062 if (mask & AT_UID) {
3058 3063 new_uid = zfs_fuid_create(zfsvfs,
3059 3064 (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
3060 3065 if (new_uid != zp->z_uid &&
3061 3066 zfs_fuid_overquota(zfsvfs, B_FALSE, new_uid)) {
3062 3067 if (attrzp)
3063 3068 VN_RELE(ZTOV(attrzp));
3064 3069 err = SET_ERROR(EDQUOT);
3065 3070 goto out2;
3066 3071 }
3067 3072 }
3068 3073
3069 3074 if (mask & AT_GID) {
3070 3075 new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
3071 3076 cr, ZFS_GROUP, &fuidp);
3072 3077 if (new_gid != zp->z_gid &&
3073 3078 zfs_fuid_overquota(zfsvfs, B_TRUE, new_gid)) {
3074 3079 if (attrzp)
3075 3080 VN_RELE(ZTOV(attrzp));
3076 3081 err = SET_ERROR(EDQUOT);
3077 3082 goto out2;
3078 3083 }
3079 3084 }
3080 3085 }
3081 3086 tx = dmu_tx_create(zfsvfs->z_os);
3082 3087
3083 3088 if (mask & AT_MODE) {
3084 3089 uint64_t pmode = zp->z_mode;
3085 3090 uint64_t acl_obj;
3086 3091 new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
3087 3092
3088 3093 if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
3089 3094 !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
3090 3095 err = SET_ERROR(EPERM);
3091 3096 goto out;
3092 3097 }
3093 3098
3094 3099 if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))
3095 3100 goto out;
3096 3101
3097 3102 mutex_enter(&zp->z_lock);
3098 3103 if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
3099 3104 /*
3100 3105 * Are we upgrading ACL from old V0 format
3101 3106 * to V1 format?
3102 3107 */
3103 3108 if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
3104 3109 zfs_znode_acl_version(zp) ==
3105 3110 ZFS_ACL_VERSION_INITIAL) {
3106 3111 dmu_tx_hold_free(tx, acl_obj, 0,
3107 3112 DMU_OBJECT_END);
3108 3113 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3109 3114 0, aclp->z_acl_bytes);
3110 3115 } else {
3111 3116 dmu_tx_hold_write(tx, acl_obj, 0,
3112 3117 aclp->z_acl_bytes);
3113 3118 }
3114 3119 } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3115 3120 dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
3116 3121 0, aclp->z_acl_bytes);
3117 3122 }
3118 3123 mutex_exit(&zp->z_lock);
3119 3124 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3120 3125 } else {
3121 3126 if ((mask & AT_XVATTR) &&
3122 3127 XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3123 3128 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
3124 3129 else
3125 3130 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
3126 3131 }
3127 3132
3128 3133 if (attrzp) {
3129 3134 dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
3130 3135 }
3131 3136
3132 3137 fuid_dirtied = zfsvfs->z_fuid_dirty;
3133 3138 if (fuid_dirtied)
3134 3139 zfs_fuid_txhold(zfsvfs, tx);
3135 3140
3136 3141 zfs_sa_upgrade_txholds(tx, zp);
3137 3142
3138 3143 err = dmu_tx_assign(tx, TXG_WAIT);
3139 3144 if (err)
3140 3145 goto out;
3141 3146
3142 3147 count = 0;
3143 3148 /*
3144 3149 * Set each attribute requested.
3145 3150 * We group settings according to the locks they need to acquire.
3146 3151 *
3147 3152 * Note: you cannot set ctime directly, although it will be
3148 3153 * updated as a side-effect of calling this function.
3149 3154 */
3150 3155
3151 3156
3152 3157 if (mask & (AT_UID|AT_GID|AT_MODE))
3153 3158 mutex_enter(&zp->z_acl_lock);
3154 3159 mutex_enter(&zp->z_lock);
3155 3160
3156 3161 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
3157 3162 &zp->z_pflags, sizeof (zp->z_pflags));
3158 3163
3159 3164 if (attrzp) {
3160 3165 if (mask & (AT_UID|AT_GID|AT_MODE))
3161 3166 mutex_enter(&attrzp->z_acl_lock);
3162 3167 mutex_enter(&attrzp->z_lock);
3163 3168 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3164 3169 SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
3165 3170 sizeof (attrzp->z_pflags));
3166 3171 }
3167 3172
3168 3173 if (mask & (AT_UID|AT_GID)) {
3169 3174
3170 3175 if (mask & AT_UID) {
3171 3176 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
3172 3177 &new_uid, sizeof (new_uid));
3173 3178 zp->z_uid = new_uid;
3174 3179 if (attrzp) {
3175 3180 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3176 3181 SA_ZPL_UID(zfsvfs), NULL, &new_uid,
3177 3182 sizeof (new_uid));
3178 3183 attrzp->z_uid = new_uid;
3179 3184 }
3180 3185 }
3181 3186
3182 3187 if (mask & AT_GID) {
3183 3188 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
3184 3189 NULL, &new_gid, sizeof (new_gid));
3185 3190 zp->z_gid = new_gid;
3186 3191 if (attrzp) {
3187 3192 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3188 3193 SA_ZPL_GID(zfsvfs), NULL, &new_gid,
3189 3194 sizeof (new_gid));
3190 3195 attrzp->z_gid = new_gid;
3191 3196 }
3192 3197 }
3193 3198 if (!(mask & AT_MODE)) {
3194 3199 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
3195 3200 NULL, &new_mode, sizeof (new_mode));
3196 3201 new_mode = zp->z_mode;
3197 3202 }
3198 3203 err = zfs_acl_chown_setattr(zp);
3199 3204 ASSERT(err == 0);
3200 3205 if (attrzp) {
3201 3206 err = zfs_acl_chown_setattr(attrzp);
3202 3207 ASSERT(err == 0);
3203 3208 }
3204 3209 }
3205 3210
3206 3211 if (mask & AT_MODE) {
3207 3212 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
3208 3213 &new_mode, sizeof (new_mode));
3209 3214 zp->z_mode = new_mode;
3210 3215 ASSERT3U((uintptr_t)aclp, !=, NULL);
3211 3216 err = zfs_aclset_common(zp, aclp, cr, tx);
3212 3217 ASSERT0(err);
3213 3218 if (zp->z_acl_cached)
3214 3219 zfs_acl_free(zp->z_acl_cached);
3215 3220 zp->z_acl_cached = aclp;
3216 3221 aclp = NULL;
3217 3222 }
3218 3223
3219 3224
3220 3225 if (mask & AT_ATIME) {
3221 3226 ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime);
3222 3227 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
3223 3228 &zp->z_atime, sizeof (zp->z_atime));
3224 3229 }
3225 3230
3226 3231 if (mask & AT_MTIME) {
3227 3232 ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
3228 3233 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
3229 3234 mtime, sizeof (mtime));
3230 3235 }
3231 3236
3232 3237 /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */
3233 3238 if (mask & AT_SIZE && !(mask & AT_MTIME)) {
3234 3239 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
3235 3240 NULL, mtime, sizeof (mtime));
3236 3241 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3237 3242 &ctime, sizeof (ctime));
3238 3243 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
3239 3244 B_TRUE);
3240 3245 } else if (mask != 0) {
3241 3246 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
3242 3247 &ctime, sizeof (ctime));
3243 3248 zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime,
3244 3249 B_TRUE);
3245 3250 if (attrzp) {
3246 3251 SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
3247 3252 SA_ZPL_CTIME(zfsvfs), NULL,
3248 3253 &ctime, sizeof (ctime));
3249 3254 zfs_tstamp_update_setup(attrzp, STATE_CHANGED,
3250 3255 mtime, ctime, B_TRUE);
3251 3256 }
3252 3257 }
3253 3258 /*
3254 3259 	 * Do this after setting the timestamps to prevent a timestamp
3255 3260 	 * update from toggling the bit.
3256 3261 */
3257 3262
3258 3263 if (xoap && (mask & AT_XVATTR)) {
3259 3264
3260 3265 /*
3261 3266 	 * Restore the masks trimmed off above
3262 3267 	 * so that the return masks can be set for the caller.
3263 3268 */
3264 3269
3265 3270 if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) {
3266 3271 XVA_SET_REQ(xvap, XAT_APPENDONLY);
3267 3272 }
3268 3273 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) {
3269 3274 XVA_SET_REQ(xvap, XAT_NOUNLINK);
3270 3275 }
3271 3276 if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) {
3272 3277 XVA_SET_REQ(xvap, XAT_IMMUTABLE);
3273 3278 }
3274 3279 if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) {
3275 3280 XVA_SET_REQ(xvap, XAT_NODUMP);
3276 3281 }
3277 3282 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) {
3278 3283 XVA_SET_REQ(xvap, XAT_AV_MODIFIED);
3279 3284 }
3280 3285 if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) {
3281 3286 XVA_SET_REQ(xvap, XAT_AV_QUARANTINED);
3282 3287 }
3283 3288
3284 3289 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
3285 3290 ASSERT(vp->v_type == VREG);
3286 3291
3287 3292 zfs_xvattr_set(zp, xvap, tx);
3288 3293 }
3289 3294
3290 3295 if (fuid_dirtied)
3291 3296 zfs_fuid_sync(zfsvfs, tx);
3292 3297
3293 3298 if (mask != 0)
3294 3299 zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp);
3295 3300
3296 3301 mutex_exit(&zp->z_lock);
3297 3302 if (mask & (AT_UID|AT_GID|AT_MODE))
3298 3303 mutex_exit(&zp->z_acl_lock);
3299 3304
3300 3305 if (attrzp) {
3301 3306 if (mask & (AT_UID|AT_GID|AT_MODE))
3302 3307 mutex_exit(&attrzp->z_acl_lock);
3303 3308 mutex_exit(&attrzp->z_lock);
3304 3309 }
3305 3310 out:
3306 3311 if (err == 0 && attrzp) {
3307 3312 err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk,
3308 3313 xattr_count, tx);
3309 3314 ASSERT(err2 == 0);
3310 3315 }
3311 3316
3312 3317 if (attrzp)
3313 3318 VN_RELE(ZTOV(attrzp));
3314 3319
3315 3320 if (aclp)
3316 3321 zfs_acl_free(aclp);
3317 3322
3318 3323 if (fuidp) {
3319 3324 zfs_fuid_info_free(fuidp);
3320 3325 fuidp = NULL;
3321 3326 }
3322 3327
3323 3328 if (err) {
3324 3329 dmu_tx_abort(tx);
3325 3330 if (err == ERESTART)
3326 3331 goto top;
3327 3332 } else {
3328 3333 err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
3329 3334 dmu_tx_commit(tx);
3330 3335 }
3331 3336
3332 3337 out2:
3333 3338 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3334 3339 zil_commit(zilog, 0);
3335 3340
3336 3341 ZFS_EXIT(zfsvfs);
3337 3342 return (err);
3338 3343 }
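
/*
 * Review sketch, not part of this change: a minimal ownership change
 * through the entry point above. Only the fields selected by va_mask
 * are consulted; everything else in the vattr is ignored. The
 * function name is hypothetical.
 */
static int
zfs_chown_demo(vnode_t *vp, uid_t uid, gid_t gid, cred_t *cr)
{
	vattr_t va;

	bzero(&va, sizeof (va));
	va.va_mask = AT_UID | AT_GID;
	va.va_uid = uid;
	va.va_gid = gid;

	return (VOP_SETATTR(vp, &va, 0, cr, NULL));
}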
3339 3344
3340 3345 typedef struct zfs_zlock {
3341 3346 krwlock_t *zl_rwlock; /* lock we acquired */
3342 3347 znode_t *zl_znode; /* znode we held */
3343 3348 struct zfs_zlock *zl_next; /* next in list */
3344 3349 } zfs_zlock_t;
3345 3350
3346 3351 /*
3347 3352 * Drop locks and release vnodes that were held by zfs_rename_lock().
3348 3353 */
3349 3354 static void
3350 3355 zfs_rename_unlock(zfs_zlock_t **zlpp)
3351 3356 {
3352 3357 zfs_zlock_t *zl;
3353 3358
3354 3359 while ((zl = *zlpp) != NULL) {
3355 3360 if (zl->zl_znode != NULL)
3356 3361 VN_RELE(ZTOV(zl->zl_znode));
3357 3362 rw_exit(zl->zl_rwlock);
3358 3363 *zlpp = zl->zl_next;
3359 3364 kmem_free(zl, sizeof (*zl));
3360 3365 }
3361 3366 }
3362 3367
3363 3368 /*
3364 3369 * Search back through the directory tree, using the ".." entries.
3365 3370 * Lock each directory in the chain to prevent concurrent renames.
3366 3371 * Fail any attempt to move a directory into one of its own descendants.
3367 3372 * XXX - z_parent_lock can overlap with map or grow locks
3368 3373 */
3369 3374 static int
3370 3375 zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp)
3371 3376 {
3372 3377 zfs_zlock_t *zl;
3373 3378 znode_t *zp = tdzp;
3374 3379 uint64_t rootid = zp->z_zfsvfs->z_root;
3375 3380 uint64_t oidp = zp->z_id;
3376 3381 krwlock_t *rwlp = &szp->z_parent_lock;
3377 3382 krw_t rw = RW_WRITER;
3378 3383
3379 3384 /*
3380 3385 * First pass write-locks szp and compares to zp->z_id.
3381 3386 * Later passes read-lock zp and compare to zp->z_parent.
3382 3387 */
3383 3388 do {
3384 3389 if (!rw_tryenter(rwlp, rw)) {
3385 3390 /*
3386 3391 * Another thread is renaming in this path.
3387 3392 * Note that if we are a WRITER, we don't have any
3388 3393 * parent_locks held yet.
3389 3394 */
3390 3395 if (rw == RW_READER && zp->z_id > szp->z_id) {
3391 3396 /*
3392 3397 * Drop our locks and restart
3393 3398 */
3394 3399 zfs_rename_unlock(&zl);
3395 3400 *zlpp = NULL;
3396 3401 zp = tdzp;
3397 3402 oidp = zp->z_id;
3398 3403 rwlp = &szp->z_parent_lock;
3399 3404 rw = RW_WRITER;
3400 3405 continue;
3401 3406 } else {
3402 3407 /*
3403 3408 	 * Wait for the other thread to drop its locks
3404 3409 */
3405 3410 rw_enter(rwlp, rw);
3406 3411 }
3407 3412 }
3408 3413
3409 3414 zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
3410 3415 zl->zl_rwlock = rwlp;
3411 3416 zl->zl_znode = NULL;
3412 3417 zl->zl_next = *zlpp;
3413 3418 *zlpp = zl;
3414 3419
3415 3420 if (oidp == szp->z_id) /* We're a descendant of szp */
3416 3421 return (SET_ERROR(EINVAL));
3417 3422
3418 3423 if (oidp == rootid) /* We've hit the top */
3419 3424 return (0);
3420 3425
3421 3426 if (rw == RW_READER) { /* i.e. not the first pass */
3422 3427 int error = zfs_zget(zp->z_zfsvfs, oidp, &zp);
3423 3428 if (error)
3424 3429 return (error);
3425 3430 zl->zl_znode = zp;
3426 3431 }
3427 3432 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zp->z_zfsvfs),
3428 3433 &oidp, sizeof (oidp));
3429 3434 rwlp = &zp->z_parent_lock;
3430 3435 rw = RW_READER;
3431 3436
3432 3437 } while (zp->z_id != sdzp->z_id);
3433 3438
3434 3439 return (0);
3435 3440 }
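
/*
 * Worked example for the walk above (review note, not part of this
 * change): consider "mv /a /a/b/x", so szp is /a, sdzp is /, and tdzp
 * is /a/b. The first pass write-locks /a's z_parent_lock and finds
 * that tdzp's id is neither szp's id nor the root. Following the ".."
 * entry of /a/b yields /a, so on the second pass oidp == szp->z_id
 * and the function returns EINVAL before the directory can become its
 * own descendant.
 */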
3436 3441
3437 3442 /*
3438 3443 * Move an entry from the provided source directory to the target
3439 3444 * directory. Change the entry name as indicated.
3440 3445 *
3441 3446 * IN: sdvp - Source directory containing the "old entry".
3442 3447 * snm - Old entry name.
3443 3448 * tdvp - Target directory to contain the "new entry".
3444 3449 * tnm - New entry name.
3445 3450 * cr - credentials of caller.
3446 3451 * ct - caller context
3447 3452 * flags - case flags
3448 3453 *
3449 3454 * RETURN: 0 on success, error code on failure.
3450 3455 *
3451 3456 * Timestamps:
3452 3457 * sdvp,tdvp - ctime|mtime updated
3453 3458 */
3454 3459 /*ARGSUSED*/
3455 3460 static int
3456 3461 zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr,
3457 3462 caller_context_t *ct, int flags)
3458 3463 {
3459 3464 znode_t *tdzp, *szp, *tzp;
3460 3465 znode_t *sdzp = VTOZ(sdvp);
3461 3466 zfsvfs_t *zfsvfs = sdzp->z_zfsvfs;
3462 3467 zilog_t *zilog;
3463 3468 vnode_t *realvp;
3464 3469 zfs_dirlock_t *sdl, *tdl;
3465 3470 dmu_tx_t *tx;
3466 3471 zfs_zlock_t *zl;
3467 3472 int cmp, serr, terr;
3468 3473 int error = 0, rm_err = 0;
3469 3474 int zflg = 0;
3470 3475 boolean_t waited = B_FALSE;
3471 3476
3472 3477 ZFS_ENTER(zfsvfs);
3473 3478 ZFS_VERIFY_ZP(sdzp);
3474 3479 zilog = zfsvfs->z_log;
3475 3480
3476 3481 /*
3477 3482 * Make sure we have the real vp for the target directory.
3478 3483 */
3479 3484 if (VOP_REALVP(tdvp, &realvp, ct) == 0)
3480 3485 tdvp = realvp;
3481 3486
3482 3487 tdzp = VTOZ(tdvp);
3483 3488 ZFS_VERIFY_ZP(tdzp);
3484 3489
3485 3490 /*
3486 3491 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
3487 3492 * ctldir appear to have the same v_vfsp.
3488 3493 */
3489 3494 if (tdzp->z_zfsvfs != zfsvfs || zfsctl_is_node(tdvp)) {
3490 3495 ZFS_EXIT(zfsvfs);
3491 3496 return (SET_ERROR(EXDEV));
3492 3497 }
3493 3498
3494 3499 if (zfsvfs->z_utf8 && u8_validate(tnm,
3495 3500 strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3496 3501 ZFS_EXIT(zfsvfs);
3497 3502 return (SET_ERROR(EILSEQ));
3498 3503 }
3499 3504
3500 3505 if (flags & FIGNORECASE)
3501 3506 zflg |= ZCILOOK;
3502 3507
3503 3508 top:
3504 3509 szp = NULL;
3505 3510 tzp = NULL;
3506 3511 zl = NULL;
3507 3512
3508 3513 /*
3509 3514 * This is to prevent the creation of links into attribute space
3510 3515 	 * by renaming a linked file into/out of an attribute directory.
3511 3516 * See the comment in zfs_link() for why this is considered bad.
3512 3517 */
3513 3518 if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
3514 3519 ZFS_EXIT(zfsvfs);
3515 3520 return (SET_ERROR(EINVAL));
3516 3521 }
3517 3522
3518 3523 /*
3519 3524 * Lock source and target directory entries. To prevent deadlock,
3520 3525 * a lock ordering must be defined. We lock the directory with
3521 3526 * the smallest object id first, or if it's a tie, the one with
3522 3527 * the lexically first name.
3523 3528 */
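	/*
	 * Worked example (hypothetical ids): with sdzp->z_id == 812 and
	 * tdzp->z_id == 97, cmp below is 1 and the target entry is locked
	 * first; a thread renaming in the opposite direction between the
	 * same two directories computes cmp == -1 and also locks that
	 * entry first. Because every thread agrees on the order, none can
	 * hold one directory lock while waiting on the other.
	 */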
3524 3529 if (sdzp->z_id < tdzp->z_id) {
3525 3530 cmp = -1;
3526 3531 } else if (sdzp->z_id > tdzp->z_id) {
3527 3532 cmp = 1;
3528 3533 } else {
3529 3534 /*
3530 3535 * First compare the two name arguments without
3531 3536 * considering any case folding.
3532 3537 */
3533 3538 int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
3534 3539
3535 3540 cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
3536 3541 ASSERT(error == 0 || !zfsvfs->z_utf8);
3537 3542 if (cmp == 0) {
3538 3543 /*
3539 3544 * POSIX: "If the old argument and the new argument
3540 3545 * both refer to links to the same existing file,
3541 3546 * the rename() function shall return successfully
3542 3547 * and perform no other action."
3543 3548 */
3544 3549 ZFS_EXIT(zfsvfs);
3545 3550 return (0);
3546 3551 }
3547 3552 /*
3548 3553 * If the file system is case-folding, then we may
3549 3554 * have some more checking to do. A case-folding file
3550 3555 * system is either supporting mixed case sensitivity
3551 3556 * access or is completely case-insensitive. Note
3552 3557 * that the file system is always case preserving.
3553 3558 *
3554 3559 * In mixed sensitivity mode case sensitive behavior
3555 3560 * is the default. FIGNORECASE must be used to
3556 3561 * explicitly request case insensitive behavior.
3557 3562 *
3558 3563 * If the source and target names provided differ only
3559 3564 * by case (e.g., a request to rename 'tim' to 'Tim'),
3560 3565 * we will treat this as a special case in the
3561 3566 * case-insensitive mode: as long as the source name
3562 3567 * is an exact match, we will allow this to proceed as
3563 3568 * a name-change request.
3564 3569 */
3565 3570 if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
3566 3571 (zfsvfs->z_case == ZFS_CASE_MIXED &&
3567 3572 flags & FIGNORECASE)) &&
3568 3573 u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
3569 3574 &error) == 0) {
3570 3575 /*
3571 3576 * case preserving rename request, require exact
3572 3577 * name matches
3573 3578 */
3574 3579 zflg |= ZCIEXACT;
3575 3580 zflg &= ~ZCILOOK;
3576 3581 }
3577 3582 }
3578 3583
3579 3584 /*
3580 3585 * If the source and destination directories are the same, we should
3581 3586 * grab the z_name_lock of that directory only once.
3582 3587 */
3583 3588 if (sdzp == tdzp) {
3584 3589 zflg |= ZHAVELOCK;
3585 3590 rw_enter(&sdzp->z_name_lock, RW_READER);
3586 3591 }
3587 3592
3588 3593 if (cmp < 0) {
3589 3594 serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
3590 3595 ZEXISTS | zflg, NULL, NULL);
3591 3596 terr = zfs_dirent_lock(&tdl,
3592 3597 tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
3593 3598 } else {
3594 3599 terr = zfs_dirent_lock(&tdl,
3595 3600 tdzp, tnm, &tzp, zflg, NULL, NULL);
3596 3601 serr = zfs_dirent_lock(&sdl,
3597 3602 sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
3598 3603 NULL, NULL);
3599 3604 }
3600 3605
3601 3606 if (serr) {
3602 3607 /*
3603 3608 * Source entry invalid or not there.
3604 3609 */
3605 3610 if (!terr) {
3606 3611 zfs_dirent_unlock(tdl);
3607 3612 if (tzp)
3608 3613 VN_RELE(ZTOV(tzp));
3609 3614 }
3610 3615
3611 3616 if (sdzp == tdzp)
3612 3617 rw_exit(&sdzp->z_name_lock);
3613 3618
3614 3619 if (strcmp(snm, "..") == 0)
3615 3620 serr = SET_ERROR(EINVAL);
3616 3621 ZFS_EXIT(zfsvfs);
3617 3622 return (serr);
3618 3623 }
3619 3624 if (terr) {
3620 3625 zfs_dirent_unlock(sdl);
3621 3626 VN_RELE(ZTOV(szp));
3622 3627
3623 3628 if (sdzp == tdzp)
3624 3629 rw_exit(&sdzp->z_name_lock);
3625 3630
3626 3631 if (strcmp(tnm, "..") == 0)
3627 3632 terr = SET_ERROR(EINVAL);
3628 3633 ZFS_EXIT(zfsvfs);
3629 3634 return (terr);
3630 3635 }
3631 3636
3632 3637 /*
3633 3638 * Must have write access at the source to remove the old entry
3634 3639 * and write access at the target to create the new entry.
3635 3640 * Note that if target and source are the same, this can be
3636 3641 * done in a single check.
3637 3642 */
3638 3643
3639 3644 if (error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))
3640 3645 goto out;
3641 3646
3642 3647 if (ZTOV(szp)->v_type == VDIR) {
3643 3648 /*
3644 3649 * Check to make sure rename is valid.
3645 3650 * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
3646 3651 */
3647 3652 if (error = zfs_rename_lock(szp, tdzp, sdzp, &zl))
3648 3653 goto out;
3649 3654 }
3650 3655
3651 3656 /*
3652 3657 * Does target exist?
3653 3658 */
3654 3659 if (tzp) {
3655 3660 /*
3656 3661 * Source and target must be the same type.
3657 3662 */
3658 3663 if (ZTOV(szp)->v_type == VDIR) {
3659 3664 if (ZTOV(tzp)->v_type != VDIR) {
3660 3665 error = SET_ERROR(ENOTDIR);
3661 3666 goto out;
3662 3667 }
3663 3668 } else {
3664 3669 if (ZTOV(tzp)->v_type == VDIR) {
3665 3670 error = SET_ERROR(EISDIR);
3666 3671 goto out;
3667 3672 }
3668 3673 }
3669 3674 /*
3670 3675 * POSIX dictates that when the source and target
3671 3676 * entries refer to the same file object, rename
3672 3677 * must do nothing and exit without error.
3673 3678 */
3674 3679 if (szp->z_id == tzp->z_id) {
3675 3680 error = 0;
3676 3681 goto out;
3677 3682 }
3678 3683 }
3679 3684
3680 3685 vnevent_pre_rename_src(ZTOV(szp), sdvp, snm, ct);
3681 3686 if (tzp)
3682 3687 vnevent_pre_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3683 3688
3684 3689 /*
3685 3690 * notify the target directory if it is not the same
3686 3691 	 * as the source directory.
3687 3692 */
3688 3693 if (tdvp != sdvp) {
3689 3694 vnevent_pre_rename_dest_dir(tdvp, ZTOV(szp), tnm, ct);
3690 3695 }
3691 3696
3692 3697 tx = dmu_tx_create(zfsvfs->z_os);
3693 3698 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
3694 3699 dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
3695 3700 dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
3696 3701 dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
3697 3702 if (sdzp != tdzp) {
3698 3703 dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
3699 3704 zfs_sa_upgrade_txholds(tx, tdzp);
3700 3705 }
3701 3706 if (tzp) {
3702 3707 dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
3703 3708 zfs_sa_upgrade_txholds(tx, tzp);
3704 3709 }
3705 3710
3706 3711 zfs_sa_upgrade_txholds(tx, szp);
3707 3712 dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
3708 3713 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3709 3714 if (error) {
3710 3715 if (zl != NULL)
3711 3716 zfs_rename_unlock(&zl);
3712 3717 zfs_dirent_unlock(sdl);
3713 3718 zfs_dirent_unlock(tdl);
3714 3719
3715 3720 if (sdzp == tdzp)
3716 3721 rw_exit(&sdzp->z_name_lock);
3717 3722
3718 3723 VN_RELE(ZTOV(szp));
3719 3724 if (tzp)
3720 3725 VN_RELE(ZTOV(tzp));
3721 3726 if (error == ERESTART) {
3722 3727 waited = B_TRUE;
3723 3728 dmu_tx_wait(tx);
3724 3729 dmu_tx_abort(tx);
3725 3730 goto top;
3726 3731 }
3727 3732 dmu_tx_abort(tx);
3728 3733 ZFS_EXIT(zfsvfs);
3729 3734 return (error);
3730 3735 }
3731 3736
3732 3737 if (tzp) /* Attempt to remove the existing target */
3733 3738 error = rm_err = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
3734 3739
3735 3740 if (error == 0) {
3736 3741 error = zfs_link_create(tdl, szp, tx, ZRENAMING);
3737 3742 if (error == 0) {
3738 3743 szp->z_pflags |= ZFS_AV_MODIFIED;
3739 3744
3740 3745 error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
3741 3746 (void *)&szp->z_pflags, sizeof (uint64_t), tx);
3742 3747 ASSERT0(error);
3743 3748
3744 3749 error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
3745 3750 if (error == 0) {
3746 3751 zfs_log_rename(zilog, tx, TX_RENAME |
3747 3752 (flags & FIGNORECASE ? TX_CI : 0), sdzp,
3748 3753 sdl->dl_name, tdzp, tdl->dl_name, szp);
3749 3754
3750 3755 /*
3751 3756 * Update path information for the target vnode
3752 3757 */
3753 3758 vn_renamepath(tdvp, ZTOV(szp), tnm,
3754 3759 strlen(tnm));
3755 3760 } else {
3756 3761 /*
3757 3762 * At this point, we have successfully created
3758 3763 * the target name, but have failed to remove
3759 3764 * the source name. Since the create was done
3760 3765 * with the ZRENAMING flag, there are
3761 3766 * complications; for one, the link count is
3762 3767 * wrong. The easiest way to deal with this
3763 3768 * is to remove the newly created target, and
3764 3769 * return the original error. This must
3765 3770 * succeed; fortunately, it is very unlikely to
3766 3771 * fail, since we just created it.
3767 3772 */
3768 3773 VERIFY3U(zfs_link_destroy(tdl, szp, tx,
3769 3774 ZRENAMING, NULL), ==, 0);
3770 3775 }
3771 3776 }
3772 3777 }
3773 3778
3774 3779 dmu_tx_commit(tx);
3775 3780
3776 3781 if (tzp && rm_err == 0)
3777 3782 vnevent_rename_dest(ZTOV(tzp), tdvp, tnm, ct);
3778 3783
3779 3784 if (error == 0) {
3780 3785 vnevent_rename_src(ZTOV(szp), sdvp, snm, ct);
3781 3786 /* notify the target dir if it is not the same as source dir */
3782 3787 if (tdvp != sdvp)
3783 3788 vnevent_rename_dest_dir(tdvp, ct);
3784 3789 }
3785 3790 out:
3786 3791 if (zl != NULL)
3787 3792 zfs_rename_unlock(&zl);
3788 3793
3789 3794 zfs_dirent_unlock(sdl);
3790 3795 zfs_dirent_unlock(tdl);
3791 3796
3792 3797 if (sdzp == tdzp)
3793 3798 rw_exit(&sdzp->z_name_lock);
3794 3799
3795 3800
3796 3801 VN_RELE(ZTOV(szp));
3797 3802 if (tzp)
3798 3803 VN_RELE(ZTOV(tzp));
3799 3804
3800 3805 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3801 3806 zil_commit(zilog, 0);
3802 3807
3803 3808 ZFS_EXIT(zfsvfs);
3804 3809 return (error);
3805 3810 }
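The dmu_tx_assign() dance above is the idiom shared by every write path in this file: the first attempt passes TXG_NOWAIT so no directory locks are held while the open transaction group is throttled; on ERESTART every lock is dropped, dmu_tx_wait() sleeps until the pressure clears, and the retry from top: adds TXG_NOTHROTTLE so a transaction that has already waited once is not throttled a second time. A compilable reduction of that control flow (tx_assign() is a hypothetical stand-in for dmu_tx_assign(), and the ERESTART value here is illustrative):

        #include <stdio.h>

        #define ERESTART        91      /* illustrative stand-in value */

        static int attempts;

        /* Hypothetical stand-in: throttle the first, throttleable attempt. */
        static int
        tx_assign(int nothrottle)
        {
                if (attempts++ == 0 && !nothrottle)
                        return (ERESTART);
                return (0);
        }

        int
        main(void)
        {
                int waited = 0;
                int error;

        top:
                /* ... acquire directory locks, declare tx holds ... */
                error = tx_assign(waited);
                if (error) {
                        /* ... drop every lock before sleeping ... */
                        if (error == ERESTART) {
                                waited = 1;
                                /* dmu_tx_wait(tx); dmu_tx_abort(tx); */
                                goto top;
                        }
                        /* dmu_tx_abort(tx); */
                        return (error);
                }
                /* ... modify the objects, dmu_tx_commit(tx) ... */
                printf("assigned after %d attempt(s)\n", attempts);
                return (0);
        }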
3806 3811
3807 3812 /*
3808 3813 * Insert the indicated symbolic reference entry into the directory.
3809 3814 *
3810 3815 * IN: dvp - Directory to contain new symbolic link.
3811 3816 * link - Name for new symlink entry.
3812 3817 * vap - Attributes of new entry.
3813 3818 * cr - credentials of caller.
3814 3819 * ct - caller context
3815 3820 * flags - case flags
3816 3821 *
3817 3822 * RETURN: 0 on success, error code on failure.
3818 3823 *
3819 3824 * Timestamps:
3820 3825 * dvp - ctime|mtime updated
3821 3826 */
3822 3827 /*ARGSUSED*/
3823 3828 static int
3824 3829 zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr,
3825 3830 caller_context_t *ct, int flags)
3826 3831 {
3827 3832 znode_t *zp, *dzp = VTOZ(dvp);
3828 3833 zfs_dirlock_t *dl;
3829 3834 dmu_tx_t *tx;
3830 3835 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
3831 3836 zilog_t *zilog;
3832 3837 uint64_t len = strlen(link);
3833 3838 int error;
3834 3839 int zflg = ZNEW;
3835 3840 zfs_acl_ids_t acl_ids;
3836 3841 boolean_t fuid_dirtied;
3837 3842 uint64_t txtype = TX_SYMLINK;
3838 3843 boolean_t waited = B_FALSE;
3839 3844
3840 3845 ASSERT(vap->va_type == VLNK);
3841 3846
3842 3847 ZFS_ENTER(zfsvfs);
3843 3848 ZFS_VERIFY_ZP(dzp);
3844 3849 zilog = zfsvfs->z_log;
3845 3850
3846 3851 if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
3847 3852 NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
3848 3853 ZFS_EXIT(zfsvfs);
3849 3854 return (SET_ERROR(EILSEQ));
3850 3855 }
3851 3856 if (flags & FIGNORECASE)
3852 3857 zflg |= ZCILOOK;
3853 3858
3854 3859 if (len > MAXPATHLEN) {
3855 3860 ZFS_EXIT(zfsvfs);
3856 3861 return (SET_ERROR(ENAMETOOLONG));
3857 3862 }
3858 3863
3859 3864 if ((error = zfs_acl_ids_create(dzp, 0,
3860 3865 vap, cr, NULL, &acl_ids)) != 0) {
3861 3866 ZFS_EXIT(zfsvfs);
3862 3867 return (error);
3863 3868 }
3864 3869 top:
3865 3870 /*
3866 3871 * Attempt to lock directory; fail if entry already exists.
3867 3872 */
3868 3873 error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
3869 3874 if (error) {
3870 3875 zfs_acl_ids_free(&acl_ids);
3871 3876 ZFS_EXIT(zfsvfs);
3872 3877 return (error);
3873 3878 }
3874 3879
3875 3880 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
3876 3881 zfs_acl_ids_free(&acl_ids);
3877 3882 zfs_dirent_unlock(dl);
3878 3883 ZFS_EXIT(zfsvfs);
3879 3884 return (error);
3880 3885 }
3881 3886
3882 3887 if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) {
3883 3888 zfs_acl_ids_free(&acl_ids);
3884 3889 zfs_dirent_unlock(dl);
3885 3890 ZFS_EXIT(zfsvfs);
3886 3891 return (SET_ERROR(EDQUOT));
3887 3892 }
3888 3893 tx = dmu_tx_create(zfsvfs->z_os);
3889 3894 fuid_dirtied = zfsvfs->z_fuid_dirty;
3890 3895 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
3891 3896 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
3892 3897 dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
3893 3898 ZFS_SA_BASE_ATTR_SIZE + len);
3894 3899 dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
3895 3900 if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
3896 3901 dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
3897 3902 acl_ids.z_aclp->z_acl_bytes);
3898 3903 }
3899 3904 if (fuid_dirtied)
3900 3905 zfs_fuid_txhold(zfsvfs, tx);
3901 3906 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
3902 3907 if (error) {
3903 3908 zfs_dirent_unlock(dl);
3904 3909 if (error == ERESTART) {
3905 3910 waited = B_TRUE;
3906 3911 dmu_tx_wait(tx);
3907 3912 dmu_tx_abort(tx);
3908 3913 goto top;
3909 3914 }
3910 3915 zfs_acl_ids_free(&acl_ids);
3911 3916 dmu_tx_abort(tx);
3912 3917 ZFS_EXIT(zfsvfs);
3913 3918 return (error);
3914 3919 }
3915 3920
3916 3921 /*
3917 3922 * Create a new object for the symlink.
3918 3923 	 * for version 4 ZPL datasets the symlink will be an SA attribute
3919 3924 */
3920 3925 zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
3921 3926
3922 3927 if (fuid_dirtied)
3923 3928 zfs_fuid_sync(zfsvfs, tx);
3924 3929
3925 3930 mutex_enter(&zp->z_lock);
3926 3931 if (zp->z_is_sa)
3927 3932 error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
3928 3933 link, len, tx);
3929 3934 else
3930 3935 zfs_sa_symlink(zp, link, len, tx);
3931 3936 mutex_exit(&zp->z_lock);
3932 3937
3933 3938 zp->z_size = len;
3934 3939 (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
3935 3940 &zp->z_size, sizeof (zp->z_size), tx);
3936 3941 /*
3937 3942 * Insert the new object into the directory.
3938 3943 */
3939 3944 (void) zfs_link_create(dl, zp, tx, ZNEW);
3940 3945
3941 3946 if (flags & FIGNORECASE)
3942 3947 txtype |= TX_CI;
3943 3948 zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
3944 3949
3945 3950 zfs_acl_ids_free(&acl_ids);
3946 3951
3947 3952 dmu_tx_commit(tx);
3948 3953
3949 3954 zfs_dirent_unlock(dl);
3950 3955
3951 3956 VN_RELE(ZTOV(zp));
3952 3957
3953 3958 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
3954 3959 zil_commit(zilog, 0);
3955 3960
3956 3961 ZFS_EXIT(zfsvfs);
3957 3962 return (error);
3958 3963 }
3959 3964
3960 3965 /*
3961 3966 * Return, in the buffer contained in the provided uio structure,
3962 3967 * the symbolic path referred to by vp.
3963 3968 *
3964 3969 * IN: vp - vnode of symbolic link.
3965 3970 * uio - structure to contain the link path.
3966 3971 * cr - credentials of caller.
3967 3972 * ct - caller context
3968 3973 *
3969 3974 * OUT: uio - structure containing the link path.
3970 3975 *
3971 3976 * RETURN: 0 on success, error code on failure.
3972 3977 *
3973 3978 * Timestamps:
3974 3979 * vp - atime updated
3975 3980 */
3976 3981 /* ARGSUSED */
3977 3982 static int
3978 3983 zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
3979 3984 {
3980 3985 znode_t *zp = VTOZ(vp);
3981 3986 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
3982 3987 int error;
3983 3988
3984 3989 ZFS_ENTER(zfsvfs);
3985 3990 ZFS_VERIFY_ZP(zp);
3986 3991
3987 3992 mutex_enter(&zp->z_lock);
3988 3993 if (zp->z_is_sa)
3989 3994 error = sa_lookup_uio(zp->z_sa_hdl,
3990 3995 SA_ZPL_SYMLINK(zfsvfs), uio);
3991 3996 else
3992 3997 error = zfs_sa_readlink(zp, uio);
3993 3998 mutex_exit(&zp->z_lock);
3994 3999
3995 4000 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
3996 4001
3997 4002 ZFS_EXIT(zfsvfs);
3998 4003 return (error);
3999 4004 }
4000 4005
4001 4006 /*
4002 4007 * Insert a new entry into directory tdvp referencing svp.
4003 4008 *
4004 4009 * IN: tdvp - Directory to contain new entry.
4005 4010 * svp - vnode of new entry.
4006 4011 * name - name of new entry.
4007 4012 * cr - credentials of caller.
4008 4013 * ct - caller context
4009 4014 *
4010 4015 * RETURN: 0 on success, error code on failure.
4011 4016 *
4012 4017 * Timestamps:
4013 4018 * tdvp - ctime|mtime updated
4014 4019 * svp - ctime updated
4015 4020 */
4016 4021 /* ARGSUSED */
4017 4022 static int
4018 4023 zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr,
4019 4024 caller_context_t *ct, int flags)
4020 4025 {
4021 4026 znode_t *dzp = VTOZ(tdvp);
4022 4027 znode_t *tzp, *szp;
4023 4028 zfsvfs_t *zfsvfs = dzp->z_zfsvfs;
4024 4029 zilog_t *zilog;
4025 4030 zfs_dirlock_t *dl;
4026 4031 dmu_tx_t *tx;
4027 4032 vnode_t *realvp;
4028 4033 int error;
4029 4034 int zf = ZNEW;
4030 4035 uint64_t parent;
4031 4036 uid_t owner;
4032 4037 boolean_t waited = B_FALSE;
4033 4038
4034 4039 ASSERT(tdvp->v_type == VDIR);
4035 4040
4036 4041 ZFS_ENTER(zfsvfs);
4037 4042 ZFS_VERIFY_ZP(dzp);
4038 4043 zilog = zfsvfs->z_log;
4039 4044
4040 4045 if (VOP_REALVP(svp, &realvp, ct) == 0)
4041 4046 svp = realvp;
4042 4047
4043 4048 /*
4044 4049 * POSIX dictates that we return EPERM here.
4045 4050 * Better choices include ENOTSUP or EISDIR.
4046 4051 */
4047 4052 if (svp->v_type == VDIR) {
4048 4053 ZFS_EXIT(zfsvfs);
4049 4054 return (SET_ERROR(EPERM));
4050 4055 }
4051 4056
4052 4057 szp = VTOZ(svp);
4053 4058 ZFS_VERIFY_ZP(szp);
4054 4059
4055 4060 /*
4056 4061 * We check z_zfsvfs rather than v_vfsp here, because snapshots and the
4057 4062 * ctldir appear to have the same v_vfsp.
4058 4063 */
4059 4064 if (szp->z_zfsvfs != zfsvfs || zfsctl_is_node(svp)) {
4060 4065 ZFS_EXIT(zfsvfs);
4061 4066 return (SET_ERROR(EXDEV));
4062 4067 }
4063 4068
4064 4069 /* Prevent links to .zfs/shares files */
4065 4070
4066 4071 if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
4067 4072 &parent, sizeof (uint64_t))) != 0) {
4068 4073 ZFS_EXIT(zfsvfs);
4069 4074 return (error);
4070 4075 }
4071 4076 if (parent == zfsvfs->z_shares_dir) {
4072 4077 ZFS_EXIT(zfsvfs);
4073 4078 return (SET_ERROR(EPERM));
4074 4079 }
4075 4080
4076 4081 if (zfsvfs->z_utf8 && u8_validate(name,
4077 4082 strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
4078 4083 ZFS_EXIT(zfsvfs);
4079 4084 return (SET_ERROR(EILSEQ));
4080 4085 }
4081 4086 if (flags & FIGNORECASE)
4082 4087 zf |= ZCILOOK;
4083 4088
4084 4089 /*
4085 4090 * We do not support links between attributes and non-attributes
4086 4091 * because of the potential security risk of creating links
4087 4092 * into "normal" file space in order to circumvent restrictions
4088 4093 * imposed in attribute space.
4089 4094 */
4090 4095 if ((szp->z_pflags & ZFS_XATTR) != (dzp->z_pflags & ZFS_XATTR)) {
4091 4096 ZFS_EXIT(zfsvfs);
4092 4097 return (SET_ERROR(EINVAL));
4093 4098 }
4094 4099
4095 4100
4096 4101 owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
4097 4102 if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
4098 4103 ZFS_EXIT(zfsvfs);
4099 4104 return (SET_ERROR(EPERM));
4100 4105 }
4101 4106
4102 4107 if (error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr)) {
4103 4108 ZFS_EXIT(zfsvfs);
4104 4109 return (error);
4105 4110 }
4106 4111
4107 4112 top:
4108 4113 /*
4109 4114 * Attempt to lock directory; fail if entry already exists.
4110 4115 */
4111 4116 error = zfs_dirent_lock(&dl, dzp, name, &tzp, zf, NULL, NULL);
4112 4117 if (error) {
4113 4118 ZFS_EXIT(zfsvfs);
4114 4119 return (error);
4115 4120 }
4116 4121
4117 4122 tx = dmu_tx_create(zfsvfs->z_os);
4118 4123 dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
4119 4124 dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
4120 4125 zfs_sa_upgrade_txholds(tx, szp);
4121 4126 zfs_sa_upgrade_txholds(tx, dzp);
4122 4127 error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
4123 4128 if (error) {
4124 4129 zfs_dirent_unlock(dl);
4125 4130 if (error == ERESTART) {
4126 4131 waited = B_TRUE;
4127 4132 dmu_tx_wait(tx);
4128 4133 dmu_tx_abort(tx);
4129 4134 goto top;
4130 4135 }
4131 4136 dmu_tx_abort(tx);
4132 4137 ZFS_EXIT(zfsvfs);
4133 4138 return (error);
4134 4139 }
4135 4140
4136 4141 error = zfs_link_create(dl, szp, tx, 0);
4137 4142
4138 4143 if (error == 0) {
4139 4144 uint64_t txtype = TX_LINK;
4140 4145 if (flags & FIGNORECASE)
4141 4146 txtype |= TX_CI;
4142 4147 zfs_log_link(zilog, tx, txtype, dzp, szp, name);
4143 4148 }
4144 4149
4145 4150 dmu_tx_commit(tx);
4146 4151
4147 4152 zfs_dirent_unlock(dl);
4148 4153
4149 4154 if (error == 0) {
4150 4155 vnevent_link(svp, ct);
4151 4156 }
4152 4157
4153 4158 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4154 4159 zil_commit(zilog, 0);
4155 4160
4156 4161 ZFS_EXIT(zfsvfs);
4157 4162 return (error);
4158 4163 }
4159 4164
4160 4165 /*
4161 4166 * zfs_null_putapage() is used when the file system has been force
4162 4167 * unmounted. It just drops the pages.
4163 4168 */
4164 4169 /* ARGSUSED */
4165 4170 static int
4166 4171 zfs_null_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4167 4172 size_t *lenp, int flags, cred_t *cr)
4168 4173 {
4169 4174 pvn_write_done(pp, B_INVAL|B_FORCE|B_ERROR);
4170 4175 return (0);
4171 4176 }
4172 4177
4173 4178 /*
4174 4179 * Push a page out to disk, klustering if possible.
4175 4180 *
4176 4181 * IN: vp - file to push page to.
4177 4182 * pp - page to push.
4178 4183 * flags - additional flags.
4179 4184 * cr - credentials of caller.
4180 4185 *
4181 4186 * OUT: offp - start of range pushed.
4182 4187 * lenp - len of range pushed.
4183 4188 *
4184 4189 * RETURN: 0 on success, error code on failure.
4185 4190 *
4186 4191 * NOTE: callers must have locked the page to be pushed. On
4187 4192 * exit, the page (and all other pages in the kluster) must be
4188 4193 * unlocked.
4189 4194 */
4190 4195 /* ARGSUSED */
4191 4196 static int
4192 4197 zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
4193 4198 size_t *lenp, int flags, cred_t *cr)
4194 4199 {
4195 4200 znode_t *zp = VTOZ(vp);
4196 4201 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4197 4202 dmu_tx_t *tx;
4198 4203 u_offset_t off, koff;
4199 4204 size_t len, klen;
4200 4205 int err;
4201 4206
4202 4207 off = pp->p_offset;
4203 4208 len = PAGESIZE;
4204 4209 /*
4205 4210 * If our blocksize is bigger than the page size, try to kluster
4206 4211 * multiple pages so that we write a full block (thus avoiding
4207 4212 * a read-modify-write).
4208 4213 */
4209 4214 if (off < zp->z_size && zp->z_blksz > PAGESIZE) {
4210 4215 klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
4211 4216 koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
4212 4217 ASSERT(koff <= zp->z_size);
4213 4218 if (koff + klen > zp->z_size)
4214 4219 klen = P2ROUNDUP(zp->z_size - koff, (uint64_t)PAGESIZE);
4215 4220 pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
4216 4221 }
4217 4222 ASSERT3U(btop(len), ==, btopr(len));
4218 4223
4219 4224 /*
4220 4225 * Can't push pages past end-of-file.
4221 4226 */
4222 4227 if (off >= zp->z_size) {
4223 4228 /* ignore all pages */
4224 4229 err = 0;
4225 4230 goto out;
4226 4231 } else if (off + len > zp->z_size) {
4227 4232 int npages = btopr(zp->z_size - off);
4228 4233 page_t *trunc;
4229 4234
4230 4235 page_list_break(&pp, &trunc, npages);
4231 4236 /* ignore pages past end of file */
4232 4237 if (trunc)
4233 4238 pvn_write_done(trunc, flags);
4234 4239 len = zp->z_size - off;
4235 4240 }
4236 4241
4237 4242 if (zfs_owner_overquota(zfsvfs, zp, B_FALSE) ||
4238 4243 zfs_owner_overquota(zfsvfs, zp, B_TRUE)) {
4239 4244 err = SET_ERROR(EDQUOT);
4240 4245 goto out;
4241 4246 }
4242 4247 tx = dmu_tx_create(zfsvfs->z_os);
4243 4248 dmu_tx_hold_write(tx, zp->z_id, off, len);
4244 4249
4245 4250 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4246 4251 zfs_sa_upgrade_txholds(tx, zp);
4247 4252 err = dmu_tx_assign(tx, TXG_WAIT);
4248 4253 if (err != 0) {
4249 4254 dmu_tx_abort(tx);
4250 4255 goto out;
4251 4256 }
4252 4257
4253 4258 if (zp->z_blksz <= PAGESIZE) {
4254 4259 caddr_t va = zfs_map_page(pp, S_READ);
4255 4260 ASSERT3U(len, <=, PAGESIZE);
4256 4261 dmu_write(zfsvfs->z_os, zp->z_id, off, len, va, tx);
4257 4262 zfs_unmap_page(pp, va);
4258 4263 } else {
4259 4264 err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, pp, tx);
4260 4265 }
4261 4266
4262 4267 if (err == 0) {
4263 4268 uint64_t mtime[2], ctime[2];
4264 4269 sa_bulk_attr_t bulk[3];
4265 4270 int count = 0;
4266 4271
4267 4272 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
4268 4273 &mtime, 16);
4269 4274 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
4270 4275 &ctime, 16);
4271 4276 SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
4272 4277 &zp->z_pflags, 8);
4273 4278 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
4274 4279 B_TRUE);
4275 4280 err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
4276 4281 ASSERT0(err);
4277 4282 zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
4278 4283 }
4279 4284 dmu_tx_commit(tx);
4280 4285
4281 4286 out:
4282 4287 pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
4283 4288 if (offp)
4284 4289 *offp = off;
4285 4290 if (lenp)
4286 4291 *lenp = len;
4287 4292
4288 4293 return (err);
4289 4294 }
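The kluster window above is power-of-two arithmetic throughout: klen rounds the block size up to whole pages, koff aligns the faulting offset down to a window boundary, and the final P2ROUNDUP clips the window at end of file. A stand-alone check of those steps with hypothetical numbers (a 128K-block file, 150000 bytes long, with a dirty page at offset 139264), using local copies of the sysmacros.h helpers so it builds on its own:

        #include <stdio.h>
        #include <stdint.h>

        #define PAGESIZE        4096UL
        #define ISP2(x)         (((x) & ((x) - 1)) == 0)
        #define P2ALIGN(x, a)   ((x) & -(a))
        #define P2ROUNDUP(x, a) (-(-(x) & -(a)))

        int
        main(void)
        {
                uint64_t blksz = 131072, size = 150000, off = 139264;

                uint64_t klen = P2ROUNDUP(blksz, PAGESIZE);             /* 131072 */
                uint64_t koff = ISP2(klen) ? P2ALIGN(off, klen) : 0;    /* 131072 */
                if (koff + klen > size)
                        klen = P2ROUNDUP(size - koff, PAGESIZE);        /* 20480 */

                printf("kluster [%llu, %llu)\n",
                    (unsigned long long)koff,
                    (unsigned long long)(koff + klen));
                return (0);
        }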
4290 4295
4291 4296 /*
4292 4297 * Copy the portion of the file indicated from pages into the file.
4293 4298  * The pages are stored in a page list attached to the file's vnode.
4294 4299 *
4295 4300 * IN: vp - vnode of file to push page data to.
4296 4301 * off - position in file to put data.
4297 4302 * len - amount of data to write.
4298 4303 * flags - flags to control the operation.
4299 4304 * cr - credentials of caller.
4300 4305 * ct - caller context.
4301 4306 *
4302 4307 * RETURN: 0 on success, error code on failure.
4303 4308 *
4304 4309 * Timestamps:
4305 4310 * vp - ctime|mtime updated
4306 4311 */
4307 4312 /*ARGSUSED*/
4308 4313 static int
4309 4314 zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
4310 4315 caller_context_t *ct)
4311 4316 {
4312 4317 znode_t *zp = VTOZ(vp);
4313 4318 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4314 4319 page_t *pp;
4315 4320 size_t io_len;
4316 4321 u_offset_t io_off;
4317 4322 uint_t blksz;
4318 4323 locked_range_t *lr;
4319 4324 int error = 0;
4320 4325
4321 4326 ZFS_ENTER(zfsvfs);
4322 4327 ZFS_VERIFY_ZP(zp);
4323 4328
4324 4329 /*
4325 4330 * There's nothing to do if no data is cached.
4326 4331 */
4327 4332 if (!vn_has_cached_data(vp)) {
4328 4333 ZFS_EXIT(zfsvfs);
4329 4334 return (0);
4330 4335 }
4331 4336
4332 4337 /*
4333 4338 * Align this request to the file block size in case we kluster.
4334 4339 	 * XXX - this can result in pretty aggressive locking, which can
4335 4340 	 * impact simultaneous read/write access.  One option might be
4336 4341 * to break up long requests (len == 0) into block-by-block
4337 4342 * operations to get narrower locking.
4338 4343 */
4339 4344 blksz = zp->z_blksz;
4340 4345 if (ISP2(blksz))
4341 4346 io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
4342 4347 else
4343 4348 io_off = 0;
4344 4349 if (len > 0 && ISP2(blksz))
4345 4350 io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
4346 4351 else
4347 4352 io_len = 0;
4348 4353
4349 4354 if (io_len == 0) {
4350 4355 /*
4351 4356 * Search the entire vp list for pages >= io_off.
4352 4357 */
4353 4358 lr = rangelock_enter(&zp->z_rangelock,
4354 4359 io_off, UINT64_MAX, RL_WRITER);
4355 4360 error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
4356 4361 goto out;
4357 4362 }
4358 4363 lr = rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_WRITER);
4359 4364
4360 4365 if (off > zp->z_size) {
4361 4366 /* past end of file */
4362 4367 rangelock_exit(lr);
4363 4368 ZFS_EXIT(zfsvfs);
4364 4369 return (0);
4365 4370 }
4366 4371
4367 4372 len = MIN(io_len, P2ROUNDUP(zp->z_size, PAGESIZE) - io_off);
4368 4373
4369 4374 for (off = io_off; io_off < off + len; io_off += io_len) {
4370 4375 if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
4371 4376 pp = page_lookup(vp, io_off,
4372 4377 (flags & (B_INVAL | B_FREE)) ? SE_EXCL : SE_SHARED);
4373 4378 } else {
4374 4379 pp = page_lookup_nowait(vp, io_off,
4375 4380 (flags & B_FREE) ? SE_EXCL : SE_SHARED);
4376 4381 }
4377 4382
4378 4383 if (pp != NULL && pvn_getdirty(pp, flags)) {
4379 4384 int err;
4380 4385
4381 4386 /*
4382 4387 * Found a dirty page to push
4383 4388 */
4384 4389 err = zfs_putapage(vp, pp, &io_off, &io_len, flags, cr);
4385 4390 if (err)
4386 4391 error = err;
4387 4392 } else {
4388 4393 io_len = PAGESIZE;
4389 4394 }
4390 4395 }
4391 4396 out:
4392 4397 rangelock_exit(lr);
4393 4398 if ((flags & B_ASYNC) == 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
4394 4399 zil_commit(zfsvfs->z_log, zp->z_id);
4395 4400 ZFS_EXIT(zfsvfs);
4396 4401 return (error);
4397 4402 }
4398 4403
4399 4404 /*ARGSUSED*/
4400 4405 void
4401 4406 zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct)
4402 4407 {
4403 4408 znode_t *zp = VTOZ(vp);
4404 4409 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4405 4410 int error;
4406 4411
4407 4412 rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
4408 4413 if (zp->z_sa_hdl == NULL) {
4409 4414 /*
4410 4415 * The fs has been unmounted, or we did a
4411 4416 * suspend/resume and this file no longer exists.
4412 4417 */
4413 4418 if (vn_has_cached_data(vp)) {
4414 4419 (void) pvn_vplist_dirty(vp, 0, zfs_null_putapage,
4415 4420 B_INVAL, cr);
4416 4421 }
4417 4422
4418 4423 mutex_enter(&zp->z_lock);
4419 4424 mutex_enter(&vp->v_lock);
4420 4425 ASSERT(vp->v_count == 1);
4421 4426 VN_RELE_LOCKED(vp);
4422 4427 mutex_exit(&vp->v_lock);
4423 4428 mutex_exit(&zp->z_lock);
4424 4429 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4425 4430 zfs_znode_free(zp);
4426 4431 return;
4427 4432 }
4428 4433
4429 4434 /*
4430 4435 * Attempt to push any data in the page cache. If this fails
4431 4436 * we will get kicked out later in zfs_zinactive().
4432 4437 */
4433 4438 if (vn_has_cached_data(vp)) {
4434 4439 (void) pvn_vplist_dirty(vp, 0, zfs_putapage, B_INVAL|B_ASYNC,
4435 4440 cr);
4436 4441 }
4437 4442
4438 4443 if (zp->z_atime_dirty && zp->z_unlinked == 0) {
4439 4444 dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os);
4440 4445
4441 4446 dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
4442 4447 zfs_sa_upgrade_txholds(tx, zp);
4443 4448 error = dmu_tx_assign(tx, TXG_WAIT);
4444 4449 if (error) {
4445 4450 dmu_tx_abort(tx);
4446 4451 } else {
4447 4452 mutex_enter(&zp->z_lock);
4448 4453 (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs),
4449 4454 (void *)&zp->z_atime, sizeof (zp->z_atime), tx);
4450 4455 zp->z_atime_dirty = 0;
4451 4456 mutex_exit(&zp->z_lock);
4452 4457 dmu_tx_commit(tx);
4453 4458 }
4454 4459 }
4455 4460
4456 4461 zfs_zinactive(zp);
4457 4462 rw_exit(&zfsvfs->z_teardown_inactive_lock);
4458 4463 }
4459 4464
4460 4465 /*
4461 4466 * Bounds-check the seek operation.
4462 4467 *
4463 4468 * IN: vp - vnode seeking within
4464 4469 * ooff - old file offset
4465 4470 * noffp - pointer to new file offset
4466 4471 * ct - caller context
4467 4472 *
4468 4473 * RETURN: 0 on success, EINVAL if new offset invalid.
4469 4474 */
4470 4475 /* ARGSUSED */
4471 4476 static int
4472 4477 zfs_seek(vnode_t *vp, offset_t ooff, offset_t *noffp,
4473 4478 caller_context_t *ct)
4474 4479 {
4475 4480 if (vp->v_type == VDIR)
4476 4481 return (0);
4477 4482 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
4478 4483 }
4479 4484
4480 4485 /*
4481 4486 * Pre-filter the generic locking function to trap attempts to place
4482 4487 * a mandatory lock on a memory mapped file.
4483 4488 */
4484 4489 static int
4485 4490 zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset,
4486 4491 flk_callback_t *flk_cbp, cred_t *cr, caller_context_t *ct)
4487 4492 {
4488 4493 znode_t *zp = VTOZ(vp);
4489 4494 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4490 4495
4491 4496 ZFS_ENTER(zfsvfs);
4492 4497 ZFS_VERIFY_ZP(zp);
4493 4498
4494 4499 /*
4495 4500 * We are following the UFS semantics with respect to mapcnt
4496 4501 * here: If we see that the file is mapped already, then we will
4497 4502 * return an error, but we don't worry about races between this
4498 4503 * function and zfs_map().
4499 4504 */
4500 4505 if (zp->z_mapcnt > 0 && MANDMODE(zp->z_mode)) {
4501 4506 ZFS_EXIT(zfsvfs);
4502 4507 return (SET_ERROR(EAGAIN));
4503 4508 }
4504 4509 ZFS_EXIT(zfsvfs);
4505 4510 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
4506 4511 }
4507 4512
4508 4513 /*
4509 4514 * If we can't find a page in the cache, we will create a new page
4510 4515 * and fill it with file data. For efficiency, we may try to fill
4511 4516 * multiple pages at once (klustering) to fill up the supplied page
4512 4517 * list. Note that the pages to be filled are held with an exclusive
4513 4518 * lock to prevent access by other threads while they are being filled.
4514 4519 */
4515 4520 static int
4516 4521 zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg,
4517 4522 caddr_t addr, page_t *pl[], size_t plsz, enum seg_rw rw)
4518 4523 {
4519 4524 znode_t *zp = VTOZ(vp);
4520 4525 page_t *pp, *cur_pp;
4521 4526 objset_t *os = zp->z_zfsvfs->z_os;
4522 4527 u_offset_t io_off, total;
4523 4528 size_t io_len;
4524 4529 int err;
4525 4530
4526 4531 if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) {
4527 4532 /*
4528 4533 * We only have a single page, don't bother klustering
4529 4534 */
4530 4535 io_off = off;
4531 4536 io_len = PAGESIZE;
4532 4537 pp = page_create_va(vp, io_off, io_len,
4533 4538 PG_EXCL | PG_WAIT, seg, addr);
4534 4539 } else {
4535 4540 /*
4536 4541 * Try to find enough pages to fill the page list
4537 4542 */
4538 4543 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
4539 4544 &io_len, off, plsz, 0);
4540 4545 }
4541 4546 if (pp == NULL) {
4542 4547 /*
4543 4548 * The page already exists, nothing to do here.
4544 4549 */
4545 4550 *pl = NULL;
4546 4551 return (0);
4547 4552 }
4548 4553
4549 4554 /*
4550 4555 * Fill the pages in the kluster.
4551 4556 */
4552 4557 cur_pp = pp;
4553 4558 for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) {
4554 4559 caddr_t va;
4555 4560
4556 4561 ASSERT3U(io_off, ==, cur_pp->p_offset);
4557 4562 va = zfs_map_page(cur_pp, S_WRITE);
4558 4563 err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va,
4559 4564 DMU_READ_PREFETCH);
4560 4565 zfs_unmap_page(cur_pp, va);
4561 4566 if (err) {
4562 4567 /* On error, toss the entire kluster */
4563 4568 pvn_read_done(pp, B_ERROR);
4564 4569 /* convert checksum errors into IO errors */
4565 4570 if (err == ECKSUM)
4566 4571 err = SET_ERROR(EIO);
4567 4572 return (err);
4568 4573 }
4569 4574 cur_pp = cur_pp->p_next;
4570 4575 }
4571 4576
4572 4577 /*
4573 4578 * Fill in the page list array from the kluster starting
4574 4579 * from the desired offset `off'.
4575 4580 * NOTE: the page list will always be null terminated.
4576 4581 */
4577 4582 pvn_plist_init(pp, pl, plsz, off, io_len, rw);
4578 4583 ASSERT(pl == NULL || (*pl)->p_offset == off);
4579 4584
4580 4585 return (0);
4581 4586 }
4582 4587
4583 4588 /*
4584 4589 * Return pointers to the pages for the file region [off, off + len]
4585 4590 * in the pl array. If plsz is greater than len, this function may
4586 4591 * also return page pointers from after the specified region
4587 4592 * (i.e. the region [off, off + plsz]). These additional pages are
4588 4593 * only returned if they are already in the cache, or were created as
4589 4594 * part of a klustered read.
4590 4595 *
4591 4596 * IN: vp - vnode of file to get data from.
4592 4597 * off - position in file to get data from.
4593 4598 * len - amount of data to retrieve.
4594 4599 * plsz - length of provided page list.
4595 4600 * seg - segment to obtain pages for.
4596 4601 * addr - virtual address of fault.
4597 4602 * rw - mode of created pages.
4598 4603 * cr - credentials of caller.
4599 4604 * ct - caller context.
4600 4605 *
4601 4606 * OUT: protp - protection mode of created pages.
4602 4607 * pl - list of pages created.
4603 4608 *
4604 4609 * RETURN: 0 on success, error code on failure.
4605 4610 *
4606 4611 * Timestamps:
4607 4612 * vp - atime updated
4608 4613 */
4609 4614 /* ARGSUSED */
4610 4615 static int
4611 4616 zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp,
4612 4617 page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
4613 4618 enum seg_rw rw, cred_t *cr, caller_context_t *ct)
4614 4619 {
4615 4620 znode_t *zp = VTOZ(vp);
4616 4621 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4617 4622 page_t **pl0 = pl;
4618 4623 int err = 0;
4619 4624
4620 4625 	/* we do our own caching; faultahead is unnecessary */
4621 4626 if (pl == NULL)
4622 4627 return (0);
4623 4628 else if (len > plsz)
4624 4629 len = plsz;
4625 4630 else
4626 4631 len = P2ROUNDUP(len, PAGESIZE);
4627 4632 ASSERT(plsz >= len);
4628 4633
4629 4634 ZFS_ENTER(zfsvfs);
4630 4635 ZFS_VERIFY_ZP(zp);
4631 4636
4632 4637 if (protp)
4633 4638 *protp = PROT_ALL;
4634 4639
4635 4640 /*
4636 4641 * Loop through the requested range [off, off + len) looking
4637 4642 * for pages. If we don't find a page, we will need to create
4638 4643 * a new page and fill it with data from the file.
4639 4644 */
4640 4645 while (len > 0) {
4641 4646 if (*pl = page_lookup(vp, off, SE_SHARED))
4642 4647 *(pl+1) = NULL;
4643 4648 else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw))
4644 4649 goto out;
4645 4650 while (*pl) {
4646 4651 ASSERT3U((*pl)->p_offset, ==, off);
4647 4652 off += PAGESIZE;
4648 4653 addr += PAGESIZE;
4649 4654 if (len > 0) {
4650 4655 ASSERT3U(len, >=, PAGESIZE);
4651 4656 len -= PAGESIZE;
4652 4657 }
4653 4658 ASSERT3U(plsz, >=, PAGESIZE);
4654 4659 plsz -= PAGESIZE;
4655 4660 pl++;
4656 4661 }
4657 4662 }
4658 4663
4659 4664 /*
4660 4665 * Fill out the page array with any pages already in the cache.
4661 4666 */
4662 4667 while (plsz > 0 &&
4663 4668 (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) {
4664 4669 off += PAGESIZE;
4665 4670 plsz -= PAGESIZE;
4666 4671 }
4667 4672 out:
4668 4673 if (err) {
4669 4674 /*
4670 4675 * Release any pages we have previously locked.
4671 4676 */
4672 4677 while (pl > pl0)
4673 4678 page_unlock(*--pl);
4674 4679 } else {
4675 4680 ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
4676 4681 }
4677 4682
4678 4683 *pl = NULL;
4679 4684
4680 4685 ZFS_EXIT(zfsvfs);
4681 4686 return (err);
4682 4687 }
4683 4688
4684 4689 /*
4685 4690 * Request a memory map for a section of a file. This code interacts
4686 4691 * with common code and the VM system as follows:
4687 4692 *
4688 4693 * - common code calls mmap(), which ends up in smmap_common()
4689 4694 * - this calls VOP_MAP(), which takes you into (say) zfs
4690 4695 * - zfs_map() calls as_map(), passing segvn_create() as the callback
4691 4696 * - segvn_create() creates the new segment and calls VOP_ADDMAP()
4692 4697 * - zfs_addmap() updates z_mapcnt
4693 4698 */
4694 4699 /*ARGSUSED*/
4695 4700 static int
4696 4701 zfs_map(vnode_t *vp, offset_t off, struct as *as, caddr_t *addrp,
4697 4702 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4698 4703 caller_context_t *ct)
4699 4704 {
4700 4705 znode_t *zp = VTOZ(vp);
4701 4706 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4702 4707 segvn_crargs_t vn_a;
4703 4708 int error;
4704 4709
4705 4710 ZFS_ENTER(zfsvfs);
4706 4711 ZFS_VERIFY_ZP(zp);
4707 4712
4708 4713 /*
4709 4714 * Note: ZFS_READONLY is handled in zfs_zaccess_common.
4710 4715 */
4711 4716
4712 4717 if ((prot & PROT_WRITE) && (zp->z_pflags &
4713 4718 (ZFS_IMMUTABLE | ZFS_APPENDONLY))) {
4714 4719 ZFS_EXIT(zfsvfs);
4715 4720 return (SET_ERROR(EPERM));
4716 4721 }
4717 4722
4718 4723 if ((prot & (PROT_READ | PROT_EXEC)) &&
4719 4724 (zp->z_pflags & ZFS_AV_QUARANTINED)) {
4720 4725 ZFS_EXIT(zfsvfs);
4721 4726 return (SET_ERROR(EACCES));
4722 4727 }
4723 4728
4724 4729 if (vp->v_flag & VNOMAP) {
4725 4730 ZFS_EXIT(zfsvfs);
4726 4731 return (SET_ERROR(ENOSYS));
4727 4732 }
4728 4733
4729 4734 if (off < 0 || len > MAXOFFSET_T - off) {
4730 4735 ZFS_EXIT(zfsvfs);
4731 4736 return (SET_ERROR(ENXIO));
4732 4737 }
4733 4738
4734 4739 if (vp->v_type != VREG) {
4735 4740 ZFS_EXIT(zfsvfs);
4736 4741 return (SET_ERROR(ENODEV));
4737 4742 }
4738 4743
4739 4744 /*
4740 4745 * If file is locked, disallow mapping.
4741 4746 */
4742 4747 if (MANDMODE(zp->z_mode) && vn_has_flocks(vp)) {
4743 4748 ZFS_EXIT(zfsvfs);
4744 4749 return (SET_ERROR(EAGAIN));
4745 4750 }
4746 4751
4747 4752 as_rangelock(as);
4748 4753 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
4749 4754 if (error != 0) {
4750 4755 as_rangeunlock(as);
4751 4756 ZFS_EXIT(zfsvfs);
4752 4757 return (error);
4753 4758 }
4754 4759
4755 4760 vn_a.vp = vp;
4756 4761 vn_a.offset = (u_offset_t)off;
4757 4762 vn_a.type = flags & MAP_TYPE;
4758 4763 vn_a.prot = prot;
4759 4764 vn_a.maxprot = maxprot;
4760 4765 vn_a.cred = cr;
4761 4766 vn_a.amp = NULL;
4762 4767 vn_a.flags = flags & ~MAP_TYPE;
4763 4768 vn_a.szc = 0;
4764 4769 vn_a.lgrp_mem_policy_flags = 0;
4765 4770
4766 4771 error = as_map(as, *addrp, len, segvn_create, &vn_a);
4767 4772
4768 4773 as_rangeunlock(as);
4769 4774 ZFS_EXIT(zfsvfs);
4770 4775 return (error);
4771 4776 }
4772 4777
4773 4778 /* ARGSUSED */
4774 4779 static int
4775 4780 zfs_addmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4776 4781 size_t len, uchar_t prot, uchar_t maxprot, uint_t flags, cred_t *cr,
4777 4782 caller_context_t *ct)
4778 4783 {
4779 4784 uint64_t pages = btopr(len);
4780 4785
4781 4786 atomic_add_64(&VTOZ(vp)->z_mapcnt, pages);
4782 4787 return (0);
4783 4788 }
4784 4789
4785 4790 /*
4786 4791 * The reason we push dirty pages as part of zfs_delmap() is so that we get a
4787 4792 * more accurate mtime for the associated file. Since we don't have a way of
4788 4793 * detecting when the data was actually modified, we have to resort to
4789 4794 * heuristics. If an explicit msync() is done, then we mark the mtime when the
4790 4795 * last page is pushed. The problem occurs when the msync() call is omitted,
4791 4796  * which is by far the most common case:
4792 4797 *
4793 4798 * open()
4794 4799 * mmap()
4795 4800 * <modify memory>
4796 4801 * munmap()
4797 4802 * close()
4798 4803 * <time lapse>
4799 4804 * putpage() via fsflush
4800 4805 *
4801 4806 * If we wait until fsflush to come along, we can have a modification time that
4802 4807 * is some arbitrary point in the future. In order to prevent this in the
4803 4808 * common case, we flush pages whenever a (MAP_SHARED, PROT_WRITE) mapping is
4804 4809 * torn down.
4805 4810 */
4806 4811 /* ARGSUSED */
4807 4812 static int
4808 4813 zfs_delmap(vnode_t *vp, offset_t off, struct as *as, caddr_t addr,
4809 4814 size_t len, uint_t prot, uint_t maxprot, uint_t flags, cred_t *cr,
4810 4815 caller_context_t *ct)
4811 4816 {
4812 4817 uint64_t pages = btopr(len);
4813 4818
4814 4819 ASSERT3U(VTOZ(vp)->z_mapcnt, >=, pages);
4815 4820 atomic_add_64(&VTOZ(vp)->z_mapcnt, -pages);
4816 4821
4817 4822 if ((flags & MAP_SHARED) && (prot & PROT_WRITE) &&
4818 4823 vn_has_cached_data(vp))
4819 4824 (void) VOP_PUTPAGE(vp, off, len, B_ASYNC, cr, ct);
4820 4825
4821 4826 return (0);
4822 4827 }
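The effect of this heuristic is visible from user space: write through a MAP_SHARED mapping, unmap it without ever calling msync(), and the dirty pages are still pushed (and the file's mtime updated) at munmap() time rather than whenever fsflush next runs. A minimal demonstration (file name is arbitrary):

        #include <fcntl.h>
        #include <stdio.h>
        #include <string.h>
        #include <sys/mman.h>
        #include <sys/stat.h>
        #include <unistd.h>

        int
        main(void)
        {
                const char *path = "mapped.dat";        /* hypothetical test file */
                int fd = open(path, O_RDWR | O_CREAT, 0644);
                if (fd == -1 || ftruncate(fd, 4096) == -1) {
                        perror(path);
                        return (1);
                }

                char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                    MAP_SHARED, fd, 0);
                if (p == MAP_FAILED) {
                        perror("mmap");
                        return (1);
                }

                (void) memcpy(p, "dirty", 5);   /* modify memory, no msync() */
                (void) munmap(p, 4096);         /* flush happens in zfs_delmap() */

                struct stat st;
                (void) fstat(fd, &st);
                printf("mtime: %ld\n", (long)st.st_mtime);
                (void) close(fd);
                return (0);
        }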
4823 4828
4824 4829 /*
4825 4830 * Free or allocate space in a file. Currently, this function only
4826 4831 * supports the `F_FREESP' command. However, this command is somewhat
4827 4832 * misnamed, as its functionality includes the ability to allocate as
4828 4833 * well as free space.
4829 4834 *
4830 4835 * IN: vp - vnode of file to free data in.
4831 4836 * cmd - action to take (only F_FREESP supported).
4832 4837 * bfp - section of file to free/alloc.
4833 4838 * flag - current file open mode flags.
4834 4839 * offset - current file offset.
4835 4840 * cr - credentials of caller [UNUSED].
4836 4841 * ct - caller context.
4837 4842 *
4838 4843 * RETURN: 0 on success, error code on failure.
4839 4844 *
4840 4845 * Timestamps:
4841 4846 * vp - ctime|mtime updated
4842 4847 */
4843 4848 /* ARGSUSED */
4844 4849 static int
4845 4850 zfs_space(vnode_t *vp, int cmd, flock64_t *bfp, int flag,
4846 4851 offset_t offset, cred_t *cr, caller_context_t *ct)
4847 4852 {
4848 4853 znode_t *zp = VTOZ(vp);
4849 4854 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4850 4855 uint64_t off, len;
4851 4856 int error;
4852 4857
4853 4858 ZFS_ENTER(zfsvfs);
4854 4859 ZFS_VERIFY_ZP(zp);
4855 4860
4856 4861 if (cmd != F_FREESP) {
4857 4862 ZFS_EXIT(zfsvfs);
4858 4863 return (SET_ERROR(EINVAL));
4859 4864 }
4860 4865
4861 4866 /*
4862 4867 	 * In the case where vp->v_vfsp != zp->z_zfsvfs->z_vfs (e.g. snapshots),
4863 4868 	 * our callers might not be able to detect properly that we are read-only,
4864 4869 * so check it explicitly here.
4865 4870 */
4866 4871 if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) {
4867 4872 ZFS_EXIT(zfsvfs);
4868 4873 return (SET_ERROR(EROFS));
4869 4874 }
4870 4875
4871 4876 if (error = convoff(vp, bfp, 0, offset)) {
4872 4877 ZFS_EXIT(zfsvfs);
4873 4878 return (error);
4874 4879 }
4875 4880
4876 4881 if (bfp->l_len < 0) {
4877 4882 ZFS_EXIT(zfsvfs);
4878 4883 return (SET_ERROR(EINVAL));
4879 4884 }
4880 4885
4881 4886 off = bfp->l_start;
4882 4887 len = bfp->l_len; /* 0 means from off to end of file */
4883 4888
4884 4889 error = zfs_freesp(zp, off, len, flag, TRUE);
4885 4890
4886 4891 if (error == 0 && off == 0 && len == 0)
4887 4892 vnevent_truncate(ZTOV(zp), ct);
4888 4893
4889 4894 ZFS_EXIT(zfsvfs);
4890 4895 return (error);
4891 4896 }
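From user space this function is normally reached through fcntl(F_FREESP) (on illumos, ftruncate() has historically been implemented in terms of F_FREESP as well). A minimal usage sketch that frees everything from 1 MB to end of file; per the comment above, l_len == 0 means "from l_start to EOF", so this truncates the file at that offset:

        #include <fcntl.h>
        #include <stdio.h>
        #include <unistd.h>

        int
        main(int argc, char **argv)
        {
                if (argc != 2) {
                        (void) fprintf(stderr, "usage: %s file\n", argv[0]);
                        return (1);
                }

                int fd = open(argv[1], O_WRONLY);
                if (fd == -1) {
                        perror("open");
                        return (1);
                }

                struct flock fl = { 0 };
                fl.l_whence = SEEK_SET;
                fl.l_start = 1024 * 1024;
                fl.l_len = 0;           /* 0 == free to end of file */

                if (fcntl(fd, F_FREESP, &fl) == -1)
                        perror("fcntl(F_FREESP)");

                (void) close(fd);
                return (0);
        }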
4892 4897
4893 4898 /*ARGSUSED*/
4894 4899 static int
4895 4900 zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct)
4896 4901 {
4897 4902 znode_t *zp = VTOZ(vp);
4898 4903 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
4899 4904 uint32_t gen;
4900 4905 uint64_t gen64;
4901 4906 uint64_t object = zp->z_id;
4902 4907 zfid_short_t *zfid;
4903 4908 int size, i, error;
4904 4909
4905 4910 ZFS_ENTER(zfsvfs);
4906 4911 ZFS_VERIFY_ZP(zp);
4907 4912
4908 4913 if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs),
4909 4914 &gen64, sizeof (uint64_t))) != 0) {
4910 4915 ZFS_EXIT(zfsvfs);
4911 4916 return (error);
4912 4917 }
4913 4918
4914 4919 gen = (uint32_t)gen64;
4915 4920
4916 4921 size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN;
4917 4922 if (fidp->fid_len < size) {
4918 4923 fidp->fid_len = size;
4919 4924 ZFS_EXIT(zfsvfs);
4920 4925 return (SET_ERROR(ENOSPC));
4921 4926 }
4922 4927
4923 4928 zfid = (zfid_short_t *)fidp;
4924 4929
4925 4930 zfid->zf_len = size;
4926 4931
4927 4932 for (i = 0; i < sizeof (zfid->zf_object); i++)
4928 4933 zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
4929 4934
4930 4935 /* Must have a non-zero generation number to distinguish from .zfs */
4931 4936 if (gen == 0)
4932 4937 gen = 1;
4933 4938 for (i = 0; i < sizeof (zfid->zf_gen); i++)
4934 4939 zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
4935 4940
4936 4941 if (size == LONG_FID_LEN) {
4937 4942 uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
4938 4943 zfid_long_t *zlfid;
4939 4944
4940 4945 zlfid = (zfid_long_t *)fidp;
4941 4946
4942 4947 for (i = 0; i < sizeof (zlfid->zf_setid); i++)
4943 4948 zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
4944 4949
4945 4950 /* XXX - this should be the generation number for the objset */
4946 4951 for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
4947 4952 zlfid->zf_setgen[i] = 0;
4948 4953 }
4949 4954
4950 4955 ZFS_EXIT(zfsvfs);
4951 4956 return (0);
4952 4957 }
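The byte-spreading loops above store the object number and generation least-significant byte first, so the resulting file id is independent of host endianness (which matters for NFS file handles that may outlive a server). A stand-alone round trip of that encoding, assuming the 6-byte object / 4-byte generation layout of zfid_short_t:

        #include <stdio.h>
        #include <stdint.h>

        int
        main(void)
        {
                uint64_t object = 0x1234abcdULL;        /* hypothetical znode id */
                uint32_t gen = 7;
                uint8_t zf_object[6], zf_gen[4];
                int i;

                for (i = 0; i < (int)sizeof (zf_object); i++)
                        zf_object[i] = (uint8_t)(object >> (8 * i));
                for (i = 0; i < (int)sizeof (zf_gen); i++)
                        zf_gen[i] = (uint8_t)(gen >> (8 * i));

                /* Decode and verify the round trip. */
                uint64_t obj2 = 0;
                for (i = 0; i < (int)sizeof (zf_object); i++)
                        obj2 |= (uint64_t)zf_object[i] << (8 * i);

                printf("object %#llx -> %#llx\n",
                    (unsigned long long)object, (unsigned long long)obj2);
                return (0);
        }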
4953 4958
4954 4959 static int
4955 4960 zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr,
4956 4961 caller_context_t *ct)
4957 4962 {
4958 4963 znode_t *zp, *xzp;
4959 4964 zfsvfs_t *zfsvfs;
4960 4965 zfs_dirlock_t *dl;
4961 4966 int error;
4962 4967
4963 4968 switch (cmd) {
4964 4969 case _PC_LINK_MAX:
4965 4970 *valp = ULONG_MAX;
4966 4971 return (0);
4967 4972
4968 4973 case _PC_FILESIZEBITS:
4969 4974 *valp = 64;
4970 4975 return (0);
4971 4976
4972 4977 case _PC_XATTR_EXISTS:
4973 4978 zp = VTOZ(vp);
4974 4979 zfsvfs = zp->z_zfsvfs;
4975 4980 ZFS_ENTER(zfsvfs);
4976 4981 ZFS_VERIFY_ZP(zp);
4977 4982 *valp = 0;
4978 4983 error = zfs_dirent_lock(&dl, zp, "", &xzp,
4979 4984 ZXATTR | ZEXISTS | ZSHARED, NULL, NULL);
4980 4985 if (error == 0) {
4981 4986 zfs_dirent_unlock(dl);
4982 4987 if (!zfs_dirempty(xzp))
4983 4988 *valp = 1;
4984 4989 VN_RELE(ZTOV(xzp));
4985 4990 } else if (error == ENOENT) {
4986 4991 /*
4987 4992 * If there aren't extended attributes, it's the
4988 4993 * same as having zero of them.
4989 4994 */
4990 4995 error = 0;
4991 4996 }
4992 4997 ZFS_EXIT(zfsvfs);
4993 4998 return (error);
4994 4999
4995 5000 case _PC_SATTR_ENABLED:
4996 5001 case _PC_SATTR_EXISTS:
4997 5002 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) &&
4998 5003 (vp->v_type == VREG || vp->v_type == VDIR);
4999 5004 return (0);
5000 5005
5001 5006 case _PC_ACCESS_FILTERING:
5002 5007 *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) &&
5003 5008 vp->v_type == VDIR;
5004 5009 return (0);
5005 5010
5006 5011 case _PC_ACL_ENABLED:
5007 5012 *valp = _ACL_ACE_ENABLED;
5008 5013 return (0);
5009 5014
5010 5015 case _PC_MIN_HOLE_SIZE:
5011 5016 *valp = (ulong_t)SPA_MINBLOCKSIZE;
5012 5017 return (0);
5013 5018
5014 5019 case _PC_TIMESTAMP_RESOLUTION:
5015 5020 /* nanosecond timestamp resolution */
5016 5021 *valp = 1L;
5017 5022 return (0);
5018 5023
5019 5024 default:
5020 5025 return (fs_pathconf(vp, cmd, valp, cr, ct));
5021 5026 }
5022 5027 }
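All of these values surface through pathconf(2)/fpathconf(2), which arrive here via VOP_PATHCONF. A quick user-space probe (the printed values assume a ZFS-backed path):

        #include <stdio.h>
        #include <unistd.h>

        int
        main(int argc, char **argv)
        {
                const char *path = (argc > 1) ? argv[1] : ".";

                printf("_PC_FILESIZEBITS = %ld\n",
                    pathconf(path, _PC_FILESIZEBITS));  /* 64 on ZFS */
                printf("_PC_LINK_MAX     = %ld\n",
                    pathconf(path, _PC_LINK_MAX));
                return (0);
        }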
5023 5028
5024 5029 /*ARGSUSED*/
5025 5030 static int
5026 5031 zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5027 5032 caller_context_t *ct)
5028 5033 {
5029 5034 znode_t *zp = VTOZ(vp);
5030 5035 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5031 5036 int error;
5032 5037 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5033 5038
5034 5039 ZFS_ENTER(zfsvfs);
5035 5040 ZFS_VERIFY_ZP(zp);
5036 5041 error = zfs_getacl(zp, vsecp, skipaclchk, cr);
5037 5042 ZFS_EXIT(zfsvfs);
5038 5043
5039 5044 return (error);
5040 5045 }
5041 5046
5042 5047 /*ARGSUSED*/
5043 5048 static int
5044 5049 zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr,
5045 5050 caller_context_t *ct)
5046 5051 {
5047 5052 znode_t *zp = VTOZ(vp);
5048 5053 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5049 5054 int error;
5050 5055 boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE;
5051 5056 zilog_t *zilog = zfsvfs->z_log;
5052 5057
5053 5058 ZFS_ENTER(zfsvfs);
5054 5059 ZFS_VERIFY_ZP(zp);
5055 5060
5056 5061 error = zfs_setacl(zp, vsecp, skipaclchk, cr);
5057 5062
5058 5063 if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
5059 5064 zil_commit(zilog, 0);
5060 5065
5061 5066 ZFS_EXIT(zfsvfs);
5062 5067 return (error);
5063 5068 }
5064 5069
5065 5070 /*
5066 5071 * The smallest read we may consider to loan out an arcbuf.
5067 5072 * This must be a power of 2.
5068 5073 */
5069 5074 int zcr_blksz_min = (1 << 10); /* 1K */
5070 5075 /*
5071 5076 * If set to less than the file block size, allow loaning out of an
5072 5077 * arcbuf for a partial block read. This must be a power of 2.
5073 5078 */
5074 5079 int zcr_blksz_max = (1 << 17); /* 128K */
5075 5080
5076 5081 /*ARGSUSED*/
5077 5082 static int
5078 5083 zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr,
5079 5084 caller_context_t *ct)
5080 5085 {
5081 5086 znode_t *zp = VTOZ(vp);
5082 5087 zfsvfs_t *zfsvfs = zp->z_zfsvfs;
5083 5088 int max_blksz = zfsvfs->z_max_blksz;
5084 5089 uio_t *uio = &xuio->xu_uio;
5085 5090 ssize_t size = uio->uio_resid;
5086 5091 offset_t offset = uio->uio_loffset;
5087 5092 int blksz;
5088 5093 int fullblk, i;
5089 5094 arc_buf_t *abuf;
5090 5095 ssize_t maxsize;
5091 5096 int preamble, postamble;
5092 5097
5093 5098 if (xuio->xu_type != UIOTYPE_ZEROCOPY)
5094 5099 return (SET_ERROR(EINVAL));
5095 5100
5096 5101 ZFS_ENTER(zfsvfs);
5097 5102 ZFS_VERIFY_ZP(zp);
5098 5103 switch (ioflag) {
5099 5104 case UIO_WRITE:
5100 5105 /*
5101 5106 * Loan out an arc_buf for write if write size is bigger than
5102 5107 * max_blksz, and the file's block size is also max_blksz.
5103 5108 */
5104 5109 blksz = max_blksz;
5105 5110 if (size < blksz || zp->z_blksz != blksz) {
5106 5111 ZFS_EXIT(zfsvfs);
5107 5112 return (SET_ERROR(EINVAL));
5108 5113 }
5109 5114 /*
5110 5115 * Caller requests buffers for write before knowing where the
5111 5116 * write offset might be (e.g. NFS TCP write).
5112 5117 */
5113 5118 if (offset == -1) {
5114 5119 preamble = 0;
5115 5120 } else {
5116 5121 preamble = P2PHASE(offset, blksz);
5117 5122 if (preamble) {
5118 5123 preamble = blksz - preamble;
5119 5124 size -= preamble;
5120 5125 }
5121 5126 }
5122 5127
5123 5128 postamble = P2PHASE(size, blksz);
5124 5129 size -= postamble;
5125 5130
5126 5131 fullblk = size / blksz;
5127 5132 (void) dmu_xuio_init(xuio,
5128 5133 (preamble != 0) + fullblk + (postamble != 0));
5129 5134 DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble,
5130 5135 int, postamble, int,
5131 5136 (preamble != 0) + fullblk + (postamble != 0));
5132 5137
5133 5138 /*
5134 5139 * Have to fix iov base/len for partial buffers. They
5135 5140 * currently represent full arc_buf's.
5136 5141 */
5137 5142 if (preamble) {
5138 5143 /* data begins in the middle of the arc_buf */
5139 5144 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5140 5145 blksz);
5141 5146 ASSERT(abuf);
5142 5147 (void) dmu_xuio_add(xuio, abuf,
5143 5148 blksz - preamble, preamble);
5144 5149 }
5145 5150
5146 5151 for (i = 0; i < fullblk; i++) {
5147 5152 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5148 5153 blksz);
5149 5154 ASSERT(abuf);
5150 5155 (void) dmu_xuio_add(xuio, abuf, 0, blksz);
5151 5156 }
5152 5157
5153 5158 if (postamble) {
5154 5159 /* data ends in the middle of the arc_buf */
5155 5160 abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl),
5156 5161 blksz);
5157 5162 ASSERT(abuf);
5158 5163 (void) dmu_xuio_add(xuio, abuf, 0, postamble);
5159 5164 }
5160 5165 break;
5161 5166 case UIO_READ:
5162 5167 /*
5163 5168 * Loan out an arc_buf for read if the read size is larger than
5164 5169 * the current file block size. Block alignment is not
5165 5170 * considered. Partial arc_buf will be loaned out for read.
5166 5171 */
5167 5172 blksz = zp->z_blksz;
5168 5173 if (blksz < zcr_blksz_min)
5169 5174 blksz = zcr_blksz_min;
5170 5175 if (blksz > zcr_blksz_max)
5171 5176 blksz = zcr_blksz_max;
5172 5177 /* avoid potential complexity of dealing with it */
5173 5178 if (blksz > max_blksz) {
5174 5179 ZFS_EXIT(zfsvfs);
5175 5180 return (SET_ERROR(EINVAL));
5176 5181 }
5177 5182
5178 5183 maxsize = zp->z_size - uio->uio_loffset;
5179 5184 if (size > maxsize)
5180 5185 size = maxsize;
5181 5186
5182 5187 if (size < blksz || vn_has_cached_data(vp)) {
5183 5188 ZFS_EXIT(zfsvfs);
5184 5189 return (SET_ERROR(EINVAL));
5185 5190 }
5186 5191 break;
5187 5192 default:
5188 5193 ZFS_EXIT(zfsvfs);
5189 5194 return (SET_ERROR(EINVAL));
5190 5195 }
5191 5196
5192 5197 uio->uio_extflg = UIO_XUIO;
5193 5198 XUIO_XUZC_RW(xuio) = ioflag;
5194 5199 ZFS_EXIT(zfsvfs);
5195 5200 return (0);
5196 5201 }
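The preamble/postamble split above peels the unaligned head and tail off the request so that only whole max_blksz buffers sit in the middle. Worked through with hypothetical numbers (a 600000-byte write at offset 100000 with 128K blocks) it yields one 31072-byte preamble, four full 131072-byte buffers, and a 44640-byte postamble, i.e. six loaned arc_bufs. A stand-alone check, with P2PHASE copied locally from sysmacros.h:

        #include <stdio.h>
        #include <stdint.h>

        #define P2PHASE(x, a)   ((x) & ((a) - 1))

        int
        main(void)
        {
                int64_t blksz = 131072, size = 600000, offset = 100000;

                int64_t preamble = P2PHASE(offset, blksz);      /* 100000 */
                if (preamble) {
                        preamble = blksz - preamble;            /* 31072 */
                        size -= preamble;                       /* 568928 */
                }
                int64_t postamble = P2PHASE(size, blksz);       /* 44640 */
                size -= postamble;                              /* 524288 */
                int64_t fullblk = size / blksz;                 /* 4 */

                printf("bufs = %lld\n", (long long)
                    ((preamble != 0) + fullblk + (postamble != 0)));    /* 6 */
                return (0);
        }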
5197 5202
5198 5203 /*ARGSUSED*/
5199 5204 static int
5200 5205 zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct)
5201 5206 {
5202 5207 int i;
5203 5208 arc_buf_t *abuf;
5204 5209 int ioflag = XUIO_XUZC_RW(xuio);
5205 5210
5206 5211 ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY);
5207 5212
5208 5213 i = dmu_xuio_cnt(xuio);
5209 5214 while (i-- > 0) {
5210 5215 abuf = dmu_xuio_arcbuf(xuio, i);
5211 5216 /*
5212 5217 * if abuf == NULL, it must be a write buffer
5213 5218 * that has been returned in zfs_write().
5214 5219 */
5215 5220 if (abuf)
5216 5221 dmu_return_arcbuf(abuf);
5217 5222 ASSERT(abuf || ioflag == UIO_WRITE);
5218 5223 }
5219 5224
5220 5225 dmu_xuio_fini(xuio);
5221 5226 return (0);
5222 5227 }
5223 5228
5224 5229 /*
5225 5230 * Predeclare these here so that the compiler assumes that
5226 5231 * this is an "old style" function declaration that does
5227 5232 * not include arguments => we won't get type mismatch errors
5228 5233 * in the initializations that follow.
5229 5234 */
5230 5235 static int zfs_inval();
5231 5236 static int zfs_isdir();
5232 5237
5233 5238 static int
5234 5239 zfs_inval()
5235 5240 {
5236 5241 return (SET_ERROR(EINVAL));
5237 5242 }
5238 5243
5239 5244 static int
5240 5245 zfs_isdir()
5241 5246 {
5242 5247 return (SET_ERROR(EISDIR));
5243 5248 }
5244 5249 /*
5245 5250 * Directory vnode operations template
5246 5251 */
5247 5252 vnodeops_t *zfs_dvnodeops;
5248 5253 const fs_operation_def_t zfs_dvnodeops_template[] = {
5249 5254 VOPNAME_OPEN, { .vop_open = zfs_open },
5250 5255 VOPNAME_CLOSE, { .vop_close = zfs_close },
5251 5256 VOPNAME_READ, { .error = zfs_isdir },
5252 5257 VOPNAME_WRITE, { .error = zfs_isdir },
5253 5258 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5254 5259 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5255 5260 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5256 5261 VOPNAME_ACCESS, { .vop_access = zfs_access },
5257 5262 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5258 5263 VOPNAME_CREATE, { .vop_create = zfs_create },
5259 5264 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5260 5265 VOPNAME_LINK, { .vop_link = zfs_link },
5261 5266 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5262 5267 VOPNAME_MKDIR, { .vop_mkdir = zfs_mkdir },
5263 5268 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5264 5269 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5265 5270 VOPNAME_SYMLINK, { .vop_symlink = zfs_symlink },
5266 5271 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5267 5272 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5268 5273 VOPNAME_FID, { .vop_fid = zfs_fid },
5269 5274 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5270 5275 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5271 5276 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5272 5277 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5273 5278 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5274 5279 NULL, NULL
5275 5280 };
5276 5281
5277 5282 /*
5278 5283 * Regular file vnode operations template
5279 5284 */
5280 5285 vnodeops_t *zfs_fvnodeops;
5281 5286 const fs_operation_def_t zfs_fvnodeops_template[] = {
5282 5287 VOPNAME_OPEN, { .vop_open = zfs_open },
5283 5288 VOPNAME_CLOSE, { .vop_close = zfs_close },
5284 5289 VOPNAME_READ, { .vop_read = zfs_read },
5285 5290 VOPNAME_WRITE, { .vop_write = zfs_write },
5286 5291 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5287 5292 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5288 5293 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5289 5294 VOPNAME_ACCESS, { .vop_access = zfs_access },
5290 5295 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5291 5296 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5292 5297 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5293 5298 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5294 5299 VOPNAME_FID, { .vop_fid = zfs_fid },
5295 5300 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5296 5301 VOPNAME_FRLOCK, { .vop_frlock = zfs_frlock },
5297 5302 VOPNAME_SPACE, { .vop_space = zfs_space },
5298 5303 VOPNAME_GETPAGE, { .vop_getpage = zfs_getpage },
5299 5304 VOPNAME_PUTPAGE, { .vop_putpage = zfs_putpage },
5300 5305 VOPNAME_MAP, { .vop_map = zfs_map },
5301 5306 VOPNAME_ADDMAP, { .vop_addmap = zfs_addmap },
5302 5307 VOPNAME_DELMAP, { .vop_delmap = zfs_delmap },
5303 5308 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5304 5309 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5305 5310 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5306 5311 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5307 5312 VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf },
5308 5313 VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf },
5309 5314 NULL, NULL
5310 5315 };
5311 5316
5312 5317 /*
5313 5318 * Symbolic link vnode operations template
5314 5319 */
5315 5320 vnodeops_t *zfs_symvnodeops;
5316 5321 const fs_operation_def_t zfs_symvnodeops_template[] = {
5317 5322 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5318 5323 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5319 5324 VOPNAME_ACCESS, { .vop_access = zfs_access },
5320 5325 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5321 5326 VOPNAME_READLINK, { .vop_readlink = zfs_readlink },
5322 5327 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5323 5328 VOPNAME_FID, { .vop_fid = zfs_fid },
5324 5329 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5325 5330 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5326 5331 NULL, NULL
5327 5332 };
5328 5333
5329 5334 /*
5330 5335 * Special share hidden file vnode operations template
5331 5336 */
5332 5337 vnodeops_t *zfs_sharevnodeops;
5333 5338 const fs_operation_def_t zfs_sharevnodeops_template[] = {
5334 5339 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5335 5340 VOPNAME_ACCESS, { .vop_access = zfs_access },
5336 5341 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5337 5342 VOPNAME_FID, { .vop_fid = zfs_fid },
5338 5343 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5339 5344 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5340 5345 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5341 5346 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5342 5347 NULL, NULL
5343 5348 };
5344 5349
5345 5350 /*
5346 5351 * Extended attribute directory vnode operations template
5347 5352 *
5348 5353 * This template is identical to the directory vnode
5349 5354 * operations template except for the restricted operations:
5350 5355 * VOP_MKDIR()
5351 5356 * VOP_SYMLINK()
5352 5357 *
5353 5358 * Note that there are other restrictions embedded in:
5354 5359 * zfs_create() - restrict type to VREG
5355 5360 * zfs_link() - no links into/out of attribute space
5356 5361 * zfs_rename() - no moves into/out of attribute space
5357 5362 */
5358 5363 vnodeops_t *zfs_xdvnodeops;
5359 5364 const fs_operation_def_t zfs_xdvnodeops_template[] = {
5360 5365 VOPNAME_OPEN, { .vop_open = zfs_open },
5361 5366 VOPNAME_CLOSE, { .vop_close = zfs_close },
5362 5367 VOPNAME_IOCTL, { .vop_ioctl = zfs_ioctl },
5363 5368 VOPNAME_GETATTR, { .vop_getattr = zfs_getattr },
5364 5369 VOPNAME_SETATTR, { .vop_setattr = zfs_setattr },
5365 5370 VOPNAME_ACCESS, { .vop_access = zfs_access },
5366 5371 VOPNAME_LOOKUP, { .vop_lookup = zfs_lookup },
5367 5372 VOPNAME_CREATE, { .vop_create = zfs_create },
5368 5373 VOPNAME_REMOVE, { .vop_remove = zfs_remove },
5369 5374 VOPNAME_LINK, { .vop_link = zfs_link },
5370 5375 VOPNAME_RENAME, { .vop_rename = zfs_rename },
5371 5376 VOPNAME_MKDIR, { .error = zfs_inval },
5372 5377 VOPNAME_RMDIR, { .vop_rmdir = zfs_rmdir },
5373 5378 VOPNAME_READDIR, { .vop_readdir = zfs_readdir },
5374 5379 VOPNAME_SYMLINK, { .error = zfs_inval },
5375 5380 VOPNAME_FSYNC, { .vop_fsync = zfs_fsync },
5376 5381 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5377 5382 VOPNAME_FID, { .vop_fid = zfs_fid },
5378 5383 VOPNAME_SEEK, { .vop_seek = zfs_seek },
5379 5384 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5380 5385 VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr },
5381 5386 VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr },
5382 5387 VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support },
5383 5388 NULL, NULL
5384 5389 };
5385 5390
5386 5391 /*
5387 5392 * Error vnode operations template
5388 5393 */
5389 5394 vnodeops_t *zfs_evnodeops;
5390 5395 const fs_operation_def_t zfs_evnodeops_template[] = {
5391 5396 VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive },
5392 5397 VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf },
5393 5398 NULL, NULL
5394 5399 };
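
For reference, the templates above are turned into live vnodeops_t tables at module load time. The following is a minimal sketch of that registration (in the gate this is done by zfs_create_op_tables() in zfs_vfsops.c; zfs_make_op_tables_sketch is a hypothetical name and only two of the five tables are shown):

	/*
	 * Schematic registration of the op tables above (sketch;
	 * the remaining templates are handled the same way).
	 */
	static int
	zfs_make_op_tables_sketch(void)
	{
		int error;

		/* vn_make_ops() builds a vnodeops_t from a template. */
		error = vn_make_ops(MNTTYPE_ZFS, zfs_dvnodeops_template,
		    &zfs_dvnodeops);
		if (error)
			return (error);

		error = vn_make_ops(MNTTYPE_ZFS, zfs_fvnodeops_template,
		    &zfs_fvnodeops);
		if (error)
			return (error);

		/* ... likewise for symlink, share, and xattr-dir tables ... */
		return (0);
	}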
(4394 lines elided)