1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
24 */
25
26 /*
27 * Copyright 2015, Joyent, Inc.
28 */
29
30 #include <sys/types.h>
31 #include <sys/t_lock.h>
32 #include <sys/param.h>
33 #include <sys/time.h>
34 #include <sys/systm.h>
35 #include <sys/sysmacros.h>
36 #include <sys/resource.h>
37 #include <sys/signal.h>
38 #include <sys/cred.h>
39 #include <sys/user.h>
40 #include <sys/buf.h>
41 #include <sys/vfs.h>
42 #include <sys/vfs_opreg.h>
43 #include <sys/stat.h>
44 #include <sys/vnode.h>
45 #include <sys/mode.h>
46 #include <sys/proc.h>
47 #include <sys/disp.h>
48 #include <sys/file.h>
49 #include <sys/fcntl.h>
50 #include <sys/flock.h>
51 #include <sys/kmem.h>
52 #include <sys/uio.h>
53 #include <sys/dnlc.h>
54 #include <sys/conf.h>
55 #include <sys/errno.h>
56 #include <sys/mman.h>
57 #include <sys/fbuf.h>
58 #include <sys/pathname.h>
59 #include <sys/debug.h>
60 #include <sys/vmsystm.h>
61 #include <sys/cmn_err.h>
62 #include <sys/dirent.h>
63 #include <sys/errno.h>
64 #include <sys/modctl.h>
65 #include <sys/statvfs.h>
66 #include <sys/mount.h>
67 #include <sys/sunddi.h>
68 #include <sys/bootconf.h>
69 #include <sys/policy.h>
70
71 #include <vm/hat.h>
72 #include <vm/page.h>
73 #include <vm/pvn.h>
74 #include <vm/as.h>
75 #include <vm/seg.h>
76 #include <vm/seg_map.h>
77 #include <vm/seg_kmem.h>
78 #include <vm/seg_vn.h>
79 #include <vm/rm.h>
80 #include <vm/page.h>
81 #include <sys/swap.h>
82
83 #include <fs/fs_subr.h>
84
85 #include <sys/fs/udf_volume.h>
86 #include <sys/fs/udf_inode.h>
87
/*
 * Prototypes for the static functions implementing the udf vnode
 * operations; they are wired into udf_vnodeops_template[] below.
 */
static int32_t udf_open(struct vnode **,
	int32_t, struct cred *, caller_context_t *);
static int32_t udf_close(struct vnode *,
	int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
static int32_t udf_read(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_write(struct vnode *,
	struct uio *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_ioctl(struct vnode *,
	int32_t, intptr_t, int32_t, struct cred *, int32_t *,
	caller_context_t *);
static int32_t udf_getattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_setattr(struct vnode *,
	struct vattr *, int32_t, struct cred *, caller_context_t *);
static int32_t udf_access(struct vnode *,
	int32_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_lookup(struct vnode *,
	char *, struct vnode **, struct pathname *,
	int32_t, struct vnode *, struct cred *,
	caller_context_t *, int *, pathname_t *);
static int32_t udf_create(struct vnode *,
	char *, struct vattr *, enum vcexcl,
	int32_t, struct vnode **, struct cred *, int32_t,
	caller_context_t *, vsecattr_t *);
static int32_t udf_remove(struct vnode *,
	char *, struct cred *, caller_context_t *, int);
static int32_t udf_link(struct vnode *,
	struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_rename(struct vnode *,
	char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_mkdir(struct vnode *,
	char *, struct vattr *, struct vnode **, struct cred *,
	caller_context_t *, int, vsecattr_t *);
static int32_t udf_rmdir(struct vnode *,
	char *, struct vnode *, struct cred *, caller_context_t *, int);
static int32_t udf_readdir(struct vnode *,
	struct uio *, struct cred *, int32_t *, caller_context_t *, int);
static int32_t udf_symlink(struct vnode *,
	char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
static int32_t udf_readlink(struct vnode *,
	struct uio *, struct cred *, caller_context_t *);
static int32_t udf_fsync(struct vnode *,
	int32_t, struct cred *, caller_context_t *);
static void udf_inactive(struct vnode *,
	struct cred *, caller_context_t *);
static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
	caller_context_t *);
static int32_t udf_frlock(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
	caller_context_t *);
static int32_t udf_space(struct vnode *, int32_t,
	struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
static int32_t udf_getpage(struct vnode *, offset_t,
	size_t, uint32_t *, struct page **, size_t,
	struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
static int32_t udf_putpage(struct vnode *, offset_t,
	size_t, int32_t, struct cred *, caller_context_t *);
static int32_t udf_map(struct vnode *, offset_t, struct as *,
	caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
	caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
	caller_context_t *);
static int32_t udf_l_pathconf(struct vnode *, int32_t,
	ulong_t *, struct cred *, caller_context_t *);
static int32_t udf_pageio(struct vnode *, struct page *,
	u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);

/*
 * Non-static helpers shared with the rest of the udf module
 * (paging, read/write internals and multi-buffer strategy IO).
 */
int32_t ud_getpage_miss(struct vnode *, u_offset_t,
	size_t, struct seg *, caddr_t, page_t *pl[],
	size_t, enum seg_rw, int32_t);
void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
int32_t ud_page_fill(struct ud_inode *, page_t *,
	u_offset_t, uint32_t, u_offset_t *);
int32_t ud_iodone(struct buf *);
int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
int32_t ud_slave_done(struct buf *);
175
176 /*
177 * Structures to control multiple IO operations to get or put pages
178 * that are backed by discontiguous blocks. The master struct is
179 * a dummy that holds the original bp from pageio_setup. The
180 * slave struct holds the working bp's to do the actual IO. Once
181 * all the slave IOs complete. The master is processed as if a single
182 * IO op has completed.
183 */
uint32_t master_index = 0;	/* next mm_index to hand out (debug aid) */
typedef struct mio_master {
	kmutex_t mm_mutex;	/* protect the fields below */
	int32_t mm_size;	/* NOTE(review): presumably total bytes of */
				/* the original request — confirm in */
				/* ud_multi_strat() */
	buf_t *mm_bp;		/* original bp */
	int32_t mm_resid;	/* bytes remaining to transfer */
	int32_t mm_error;	/* accumulated error from slaves */
	int32_t mm_index;	/* XXX debugging */
} mio_master_t;

typedef struct mio_slave {
	buf_t ms_buf;		/* working buffer for this IO chunk */
	mio_master_t *ms_ptr;	/* pointer to master */
} mio_slave_t;
198
/* The udf vnode-operations vector; initialized elsewhere from the template. */
struct vnodeops *udf_vnodeops;

/*
 * Template mapping each VOP name to its udf implementation above.
 * The list is terminated by the NULL/NULL sentinel entry.
 */
const fs_operation_def_t udf_vnodeops_template[] = {
	VOPNAME_OPEN,		{ .vop_open = udf_open },
	VOPNAME_CLOSE,		{ .vop_close = udf_close },
	VOPNAME_READ,		{ .vop_read = udf_read },
	VOPNAME_WRITE,		{ .vop_write = udf_write },
	VOPNAME_IOCTL,		{ .vop_ioctl = udf_ioctl },
	VOPNAME_GETATTR,	{ .vop_getattr = udf_getattr },
	VOPNAME_SETATTR,	{ .vop_setattr = udf_setattr },
	VOPNAME_ACCESS,		{ .vop_access = udf_access },
	VOPNAME_LOOKUP,		{ .vop_lookup = udf_lookup },
	VOPNAME_CREATE,		{ .vop_create = udf_create },
	VOPNAME_REMOVE,		{ .vop_remove = udf_remove },
	VOPNAME_LINK,		{ .vop_link = udf_link },
	VOPNAME_RENAME,		{ .vop_rename = udf_rename },
	VOPNAME_MKDIR,		{ .vop_mkdir = udf_mkdir },
	VOPNAME_RMDIR,		{ .vop_rmdir = udf_rmdir },
	VOPNAME_READDIR,	{ .vop_readdir = udf_readdir },
	VOPNAME_SYMLINK,	{ .vop_symlink = udf_symlink },
	VOPNAME_READLINK,	{ .vop_readlink = udf_readlink },
	VOPNAME_FSYNC,		{ .vop_fsync = udf_fsync },
	VOPNAME_INACTIVE,	{ .vop_inactive = udf_inactive },
	VOPNAME_FID,		{ .vop_fid = udf_fid },
	VOPNAME_RWLOCK,		{ .vop_rwlock = udf_rwlock },
	VOPNAME_RWUNLOCK,	{ .vop_rwunlock = udf_rwunlock },
	VOPNAME_SEEK,		{ .vop_seek = udf_seek },
	VOPNAME_FRLOCK,		{ .vop_frlock = udf_frlock },
	VOPNAME_SPACE,		{ .vop_space = udf_space },
	VOPNAME_GETPAGE,	{ .vop_getpage = udf_getpage },
	VOPNAME_PUTPAGE,	{ .vop_putpage = udf_putpage },
	VOPNAME_MAP,		{ .vop_map = udf_map },
	VOPNAME_ADDMAP,		{ .vop_addmap = udf_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = udf_delmap },
	VOPNAME_PATHCONF,	{ .vop_pathconf = udf_l_pathconf },
	VOPNAME_PAGEIO,		{ .vop_pageio = udf_pageio },
	VOPNAME_VNEVENT,	{ .vop_vnevent = fs_vnevent_support },
	NULL,			NULL
};
238
239 /* ARGSUSED */
240 static int32_t
241 udf_open(
242 struct vnode **vpp,
243 int32_t flag,
244 struct cred *cr,
245 caller_context_t *ct)
246 {
247 ud_printf("udf_open\n");
248
249 return (0);
250 }
251
252 /* ARGSUSED */
253 static int32_t
254 udf_close(
255 struct vnode *vp,
256 int32_t flag,
257 int32_t count,
258 offset_t offset,
259 struct cred *cr,
260 caller_context_t *ct)
261 {
262 struct ud_inode *ip = VTOI(vp);
263
264 ud_printf("udf_close\n");
265
266 ITIMES(ip);
267
268 cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
269 cleanshares(vp, ttoproc(curthread)->p_pid);
270
271 /*
272 * Push partially filled cluster at last close.
273 * ``last close'' is approximated because the dnlc
274 * may have a hold on the vnode.
275 */
276 if (vp->v_count <= 2 && vp->v_type != VBAD) {
277 struct ud_inode *ip = VTOI(vp);
278 if (ip->i_delaylen) {
279 (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
280 B_ASYNC | B_FREE, cr);
281 ip->i_delaylen = 0;
282 }
283 }
284
285 return (0);
286 }
287
/* ARGSUSED */
/*
 * VOP_READ for udf. The VFS layer acquires i_rwlock as reader via
 * VOP_RWLOCK before calling here (asserted below); the __lock_lint
 * stanzas only teach the static lock checker about that convention
 * and are not compiled into the real kernel. Mandatory-locking
 * files are checked with chklock() before reading via ud_rdip().
 */
static int32_t
udf_read(
	struct vnode *vp,
	struct uio *uiop,
	int32_t ioflag,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error;

	ud_printf("udf_read\n");

#ifdef	__lock_lint
	rw_enter(&ip->i_rwlock, RW_READER);
#endif

	ASSERT(RW_READ_HELD(&ip->i_rwlock));

	if (MANDLOCK(vp, ip->i_char)) {
		/*
		 * udf_getattr ends up being called by chklock
		 */
		error = chklock(vp, FREAD, uiop->uio_loffset,
		    uiop->uio_resid, uiop->uio_fmode, ct);
		if (error) {
			goto end;
		}
	}

	/* i_contents serializes against writers/truncation during the read */
	rw_enter(&ip->i_contents, RW_READER);
	error = ud_rdip(ip, uiop, ioflag, cr);
	rw_exit(&ip->i_contents);

end:
#ifdef	__lock_lint
	rw_exit(&ip->i_rwlock);
#endif

	return (error);
}
330
331
/*
 * Write-throttling tunables. When ud_WRITES is nonzero, udf_write()
 * blocks a writer whose file has more than ud_HW bytes of outstanding
 * writes (i_writes) until woken on i_wrcv; ud_throttles counts how
 * often that happened. NOTE(review): ud_LW is presumably the low-water
 * mark at which waiters are signalled — confirm in ud_wrip()/iodone.
 */
int32_t ud_WRITES = 1;
int32_t ud_HW = 96 * 1024;	/* high water mark */
int32_t ud_LW = 64 * 1024;	/* low water mark */
int32_t ud_throttles = 0;
336
/* ARGSUSED */
/*
 * VOP_WRITE for udf. The VFS layer acquires i_rwlock as writer
 * before calling here (asserted below); __lock_lint stanzas exist
 * only for the static lock checker. After a mandatory-locking
 * check, writers are throttled against the per-file high-water
 * mark, then the write is done under i_contents via ud_wrip().
 */
static int32_t
udf_write(
	struct vnode *vp,
	struct uio *uiop,
	int32_t ioflag,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error = 0;

	ud_printf("udf_write\n");

#ifdef	__lock_lint
	rw_enter(&ip->i_rwlock, RW_WRITER);
#endif

	ASSERT(RW_WRITE_HELD(&ip->i_rwlock));

	if (MANDLOCK(vp, ip->i_char)) {
		/*
		 * ud_getattr ends up being called by chklock
		 */
		error = chklock(vp, FWRITE, uiop->uio_loffset,
		    uiop->uio_resid, uiop->uio_fmode, ct);
		if (error) {
			goto end;
		}
	}
	/*
	 * Throttle writes.
	 */
	mutex_enter(&ip->i_tlock);
	if (ud_WRITES && (ip->i_writes > ud_HW)) {
		while (ip->i_writes > ud_HW) {
			ud_throttles++;
			/* sleep until outstanding writes drain */
			cv_wait(&ip->i_wrcv, &ip->i_tlock);
		}
	}
	mutex_exit(&ip->i_tlock);

	/*
	 * Write to the file
	 */
	rw_enter(&ip->i_contents, RW_WRITER);
	if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
		/*
		 * In append mode start at end of file.
		 */
		uiop->uio_loffset = ip->i_size;
	}
	error = ud_wrip(ip, uiop, ioflag, cr);
	rw_exit(&ip->i_contents);

end:
#ifdef	__lock_lint
	rw_exit(&ip->i_rwlock);
#endif

	return (error);
}
399
400 /* ARGSUSED */
401 static int32_t
402 udf_ioctl(
403 struct vnode *vp,
404 int32_t cmd,
405 intptr_t arg,
406 int32_t flag,
407 struct cred *cr,
408 int32_t *rvalp,
409 caller_context_t *ct)
410 {
411 return (ENOTTY);
412 }
413
414 /* ARGSUSED */
415 static int32_t
416 udf_getattr(
417 struct vnode *vp,
418 struct vattr *vap,
419 int32_t flags,
420 struct cred *cr,
421 caller_context_t *ct)
422 {
423 struct ud_inode *ip = VTOI(vp);
424
425 ud_printf("udf_getattr\n");
426
427 if (vap->va_mask == AT_SIZE) {
428 /*
429 * for performance, if only the size is requested don't bother
430 * with anything else.
431 */
432 vap->va_size = ip->i_size;
433 return (0);
434 }
435
436 rw_enter(&ip->i_contents, RW_READER);
437
438 vap->va_type = vp->v_type;
439 vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
440
441 vap->va_uid = ip->i_uid;
442 vap->va_gid = ip->i_gid;
443 vap->va_fsid = ip->i_dev;
444 vap->va_nodeid = ip->i_icb_lbano;
445 vap->va_nlink = ip->i_nlink;
446 vap->va_size = ip->i_size;
447 vap->va_seq = ip->i_seq;
448 if (vp->v_type == VCHR || vp->v_type == VBLK) {
449 vap->va_rdev = ip->i_rdev;
450 } else {
451 vap->va_rdev = 0;
452 }
453
454 mutex_enter(&ip->i_tlock);
455 ITIMES_NOLOCK(ip); /* mark correct time in inode */
456 vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
457 vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
458 vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
459 vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
460 vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
461 vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
462 mutex_exit(&ip->i_tlock);
463
464 switch (ip->i_type) {
465 case VBLK:
466 vap->va_blksize = MAXBSIZE;
467 break;
468 case VCHR:
469 vap->va_blksize = MAXBSIZE;
470 break;
471 default:
472 vap->va_blksize = ip->i_udf->udf_lbsize;
473 break;
474 }
475 vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
476
477 rw_exit(&ip->i_contents);
478
479 return (0);
480 }
481
/*
 * Adapter for secpolicy_vnode_setattr(): bridges its generic
 * (void *, mode, cred) access callback to udf's ud_iaccess(),
 * translating the vnode-style mode bits to on-disk permissions.
 */
static int
ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
{
	struct ud_inode *inode = ip;

	return (ud_iaccess(inode, UD_UPERM2DPERM(mode), cr, 0));
}
487
/*ARGSUSED4*/
/*
 * VOP_SETATTR for udf: change mode, owner/group, size and/or times
 * of the file, as selected by vap->va_mask. Permission checking is
 * delegated to secpolicy_vnode_setattr() with ud_iaccess_vmode()
 * as the access callback. Both i_rwlock and i_contents are held
 * for writing across the whole update.
 */
static int32_t
udf_setattr(
	struct vnode *vp,
	struct vattr *vap,
	int32_t flags,
	struct cred *cr,
	caller_context_t *ct)
{
	int32_t error = 0;
	uint32_t mask = vap->va_mask;
	struct ud_inode *ip;
	timestruc_t now;
	struct vattr ovap;

	ud_printf("udf_setattr\n");

	ip = VTOI(vp);

	/*
	 * no updates allowed to 4096 files
	 */
	if (ip->i_astrat == STRAT_TYPE4096) {
		return (EINVAL);
	}

	/*
	 * Cannot set these attributes
	 */
	if (mask & AT_NOSET) {
		return (EINVAL);
	}

	rw_enter(&ip->i_rwlock, RW_WRITER);
	rw_enter(&ip->i_contents, RW_WRITER);

	ovap.va_uid = ip->i_uid;
	ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
	error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
	    ud_iaccess_vmode, ip);
	if (error)
		goto update_inode;

	/* secpolicy_vnode_setattr() may have altered va_mask; re-read it */
	mask = vap->va_mask;
	/*
	 * Change file access modes.
	 */
	if (mask & AT_MODE) {
		ip->i_perm = VA2UD_PERM(vap->va_mode);
		ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
		mutex_enter(&ip->i_tlock);
		ip->i_flag |= ICHG;
		mutex_exit(&ip->i_tlock);
	}
	if (mask & (AT_UID|AT_GID)) {
		if (mask & AT_UID) {
			ip->i_uid = vap->va_uid;
		}
		if (mask & AT_GID) {
			ip->i_gid = vap->va_gid;
		}
		mutex_enter(&ip->i_tlock);
		ip->i_flag |= ICHG;
		mutex_exit(&ip->i_tlock);
	}
	/*
	 * Truncate file. Must have write permission and not be a directory.
	 */
	if (mask & AT_SIZE) {
		if (vp->v_type == VDIR) {
			error = EISDIR;
			goto update_inode;
		}
		if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
			goto update_inode;
		}
		if (vap->va_size > MAXOFFSET_T) {
			error = EFBIG;
			goto update_inode;
		}
		if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
			goto update_inode;
		}

		/* Fire the appropriate fem/vnevent for the size change */
		if (vap->va_size == 0) {
			vnevent_truncate(vp, ct);
		} else {
			vnevent_resize(vp, ct);
		}
	}
	/*
	 * Change file access or modified times.
	 */
	if (mask & (AT_ATIME|AT_MTIME)) {
		mutex_enter(&ip->i_tlock);
		if (mask & AT_ATIME) {
			ip->i_atime.tv_sec = vap->va_atime.tv_sec;
			ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
			ip->i_flag &= ~IACC;
		}
		if (mask & AT_MTIME) {
			ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
			ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
			/* an explicit mtime change still updates ctime */
			gethrestime(&now);
			ip->i_ctime.tv_sec = now.tv_sec;
			ip->i_ctime.tv_nsec = now.tv_nsec;
			ip->i_flag &= ~(IUPD|ICHG);
			ip->i_flag |= IMODTIME;
		}
		ip->i_flag |= IMOD;
		mutex_exit(&ip->i_tlock);
	}

update_inode:
	/*
	 * NOTE(review): T_DONTPEND presumably means the caller needs the
	 * inode written synchronously — confirm against ud_iupdat().
	 */
	if (curthread->t_flag & T_DONTPEND) {
		ud_iupdat(ip, 1);
	} else {
		ITIMES_NOLOCK(ip);
	}
	rw_exit(&ip->i_contents);
	rw_exit(&ip->i_rwlock);

	return (error);
}
612
613 /* ARGSUSED */
614 static int32_t
615 udf_access(
616 struct vnode *vp,
617 int32_t mode,
618 int32_t flags,
619 struct cred *cr,
620 caller_context_t *ct)
621 {
622 struct ud_inode *ip = VTOI(vp);
623
624 ud_printf("udf_access\n");
625
626 if (ip->i_udf == NULL) {
627 return (EIO);
628 }
629
630 return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
631 }
632
633 int32_t udfs_stickyhack = 1;
634
635 /* ARGSUSED */
636 static int32_t
637 udf_lookup(
638 struct vnode *dvp,
639 char *nm,
640 struct vnode **vpp,
641 struct pathname *pnp,
642 int32_t flags,
643 struct vnode *rdir,
644 struct cred *cr,
645 caller_context_t *ct,
646 int *direntflags,
647 pathname_t *realpnp)
648 {
649 int32_t error;
650 struct vnode *vp;
651 struct ud_inode *ip, *xip;
652
653 ud_printf("udf_lookup\n");
654 /*
655 * Null component name is a synonym for directory being searched.
656 */
657 if (*nm == '\0') {
658 VN_HOLD(dvp);
659 *vpp = dvp;
660 error = 0;
661 goto out;
662 }
663
664 /*
665 * Fast path: Check the directory name lookup cache.
666 */
667 ip = VTOI(dvp);
668 if (vp = dnlc_lookup(dvp, nm)) {
669 /*
670 * Check accessibility of directory.
671 */
672 if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0) {
673 VN_RELE(vp);
674 }
675 xip = VTOI(vp);
676 } else {
677 error = ud_dirlook(ip, nm, &xip, cr, 1);
678 ITIMES(ip);
679 }
680
681 if (error == 0) {
682 ip = xip;
683 *vpp = ITOV(ip);
684 if ((ip->i_type != VDIR) &&
685 (ip->i_char & ISVTX) &&
686 ((ip->i_perm & IEXEC) == 0) &&
687 udfs_stickyhack) {
688 mutex_enter(&(*vpp)->v_lock);
689 (*vpp)->v_flag |= VISSWAP;
690 mutex_exit(&(*vpp)->v_lock);
691 }
692 ITIMES(ip);
693 /*
694 * If vnode is a device return special vnode instead.
695 */
696 if (IS_DEVVP(*vpp)) {
697 struct vnode *newvp;
698 newvp = specvp(*vpp, (*vpp)->v_rdev,
699 (*vpp)->v_type, cr);
700 VN_RELE(*vpp);
701 if (newvp == NULL) {
702 error = ENOSYS;
703 } else {
704 *vpp = newvp;
705 }
706 }
707 }
708 out:
709 return (error);
710 }
711
/* ARGSUSED */
/*
 * VOP_CREATE for udf: create `name' in directory `dvp' with the
 * attributes in *vap, returning a held vnode in *vpp. An existing
 * file is handled per the non-exclusive/exclusive rules below and
 * may be truncated if AT_SIZE 0 was requested. Device nodes are
 * wrapped in a specfs vnode before return.
 */
static int32_t
udf_create(
	struct vnode *dvp,
	char *name,
	struct vattr *vap,
	enum vcexcl excl,
	int32_t mode,
	struct vnode **vpp,
	struct cred *cr,
	int32_t flag,
	caller_context_t *ct,
	vsecattr_t *vsecp)
{
	int32_t error;
	struct ud_inode *ip = VTOI(dvp), *xip;

	ud_printf("udf_create\n");

	/* only privileged callers may create set-VSVTX (sticky) files */
	if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
		vap->va_mode &= ~VSVTX;

	if (*name == '\0') {
		/*
		 * Null component name refers to the directory itself.
		 */
		VN_HOLD(dvp);
		ITIMES(ip);
		error = EEXIST;
	} else {
		xip = NULL;
		rw_enter(&ip->i_rwlock, RW_WRITER);
		error = ud_direnter(ip, name, DE_CREATE,
		    (struct ud_inode *)0, (struct ud_inode *)0,
		    vap, &xip, cr, ct);
		rw_exit(&ip->i_rwlock);
		ITIMES(ip);
		/* from here on, ip is the created/existing file, not dvp */
		ip = xip;
	}
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, RW_WRITER);
#else
	if (ip != NULL) {
		rw_enter(&ip->i_contents, RW_WRITER);
	}
#endif

	/*
	 * If the file already exists and this is a non-exclusive create,
	 * check permissions and allow access for non-directories.
	 * Read-only create of an existing directory is also allowed.
	 * We fail an exclusive create of anything which already exists.
	 */
	if (error == EEXIST) {
		if (excl == NONEXCL) {
			if ((ip->i_type == VDIR) && (mode & VWRITE)) {
				error = EISDIR;
			} else if (mode) {
				error = ud_iaccess(ip,
				    UD_UPERM2DPERM(mode), cr, 0);
			} else {
				error = 0;
			}
		}
		if (error) {
			rw_exit(&ip->i_contents);
			VN_RELE(ITOV(ip));
			goto out;
		} else if ((ip->i_type == VREG) &&
		    (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
			/*
			 * Truncate regular files, if requested by caller.
			 * Grab i_rwlock to make sure no one else is
			 * currently writing to the file (we promised
			 * bmap we would do this).
			 * Must get the locks in the correct order.
			 */
			if (ip->i_size == 0) {
				ip->i_flag |= ICHG | IUPD;
			} else {
				/* drop/retake to respect lock order */
				rw_exit(&ip->i_contents);
				rw_enter(&ip->i_rwlock, RW_WRITER);
				rw_enter(&ip->i_contents, RW_WRITER);
				(void) ud_itrunc(ip, 0, 0, cr);
				rw_exit(&ip->i_rwlock);
			}
			vnevent_create(ITOV(ip), ct);
		}
	}

	if (error == 0) {
		*vpp = ITOV(ip);
		ITIMES(ip);
	}
#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#else
	if (ip != NULL) {
		rw_exit(&ip->i_contents);
	}
#endif
	if (error) {
		goto out;
	}

	/*
	 * If vnode is a device return special vnode instead.
	 */
	if (!error && IS_DEVVP(*vpp)) {
		struct vnode *newvp;

		newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
		VN_RELE(*vpp);
		if (newvp == NULL) {
			error = ENOSYS;
			goto out;
		}
		*vpp = newvp;
	}
out:
	return (error);
}
834
835 /* ARGSUSED */
836 static int32_t
837 udf_remove(
838 struct vnode *vp,
839 char *nm,
840 struct cred *cr,
841 caller_context_t *ct,
842 int flags)
843 {
844 int32_t error;
845 struct ud_inode *ip = VTOI(vp);
846
847 ud_printf("udf_remove\n");
848
849 rw_enter(&ip->i_rwlock, RW_WRITER);
850 error = ud_dirremove(ip, nm,
851 (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
852 rw_exit(&ip->i_rwlock);
853 ITIMES(ip);
854
855 return (error);
856 }
857
/* ARGSUSED */
/*
 * VOP_LINK for udf: create hard link `tnm' in directory `tdvp'
 * pointing at `svp'. Links to directories are refused, as are
 * links to files the caller does not own (absent privilege).
 */
static int32_t
udf_link(
	struct vnode *tdvp,
	struct vnode *svp,
	char *tnm,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error;
	struct vnode *realvp;
	struct ud_inode *sip;
	struct ud_inode *tdp;

	ud_printf("udf_link\n");
	/* unwrap a specfs shadow vnode to the underlying udf vnode */
	if (VOP_REALVP(svp, &realvp, ct) == 0) {
		svp = realvp;
	}

	/*
	 * Do not allow links to directories
	 */
	if (svp->v_type == VDIR) {
		return (EPERM);
	}

	sip = VTOI(svp);

	if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
		return (EPERM);

	tdp = VTOI(tdvp);

	rw_enter(&tdp->i_rwlock, RW_WRITER);
	error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
	    sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
	rw_exit(&tdp->i_rwlock);
	ITIMES(sip);
	ITIMES(tdp);

	if (error == 0) {
		vnevent_link(svp, ct);
	}

	return (error);
}
905
/* ARGSUSED */
/*
 * VOP_RENAME for udf: rename `snm' in `sdvp' to `tnm' in `tdvp'.
 * The whole operation is serialized per-filesystem through
 * udf_rename_lck; the source is first linked under the target name
 * (ud_direnter/DE_RENAME) and the old entry is then removed
 * (ud_dirremove/DR_RENAME). fem pre/post rename events are fired
 * around the operation.
 */
static int32_t
udf_rename(
	struct vnode *sdvp,
	char *snm,
	struct vnode *tdvp,
	char *tnm,
	struct cred *cr,
	caller_context_t *ct,
	int flags)
{
	int32_t error = 0;
	struct udf_vfs *udf_vfsp;
	struct ud_inode *sip;		/* source inode */
	struct ud_inode *tip;		/* target inode */
	struct ud_inode *sdp, *tdp;	/* source and target parent inode */
	struct vnode *realvp;

	ud_printf("udf_rename\n");

	/* unwrap a specfs shadow vnode to the underlying udf vnode */
	if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
		tdvp = realvp;
	}

	sdp = VTOI(sdvp);
	tdp = VTOI(tdvp);

	udf_vfsp = sdp->i_udf;

	/* one rename at a time per filesystem */
	mutex_enter(&udf_vfsp->udf_rename_lck);
	/*
	 * Look up inode of file we're supposed to rename.
	 */
	if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
		mutex_exit(&udf_vfsp->udf_rename_lck);
		return (error);
	}
	/*
	 * be sure this is not a directory with another file system mounted
	 * over it. If it is just give up the locks, and return with
	 * EBUSY
	 */
	if (vn_mountedvfs(ITOV(sip)) != NULL) {
		error = EBUSY;
		goto errout;
	}
	/*
	 * Make sure we can delete the source entry. This requires
	 * write permission on the containing directory. If that
	 * directory is "sticky" it further requires (except for
	 * privileged users) that the user own the directory or the
	 * source entry, or else have permission to write the source
	 * entry.
	 */
	rw_enter(&sdp->i_contents, RW_READER);
	rw_enter(&sip->i_contents, RW_READER);
	if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
	    (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		ITIMES(sip);
		goto errout;
	}

	/*
	 * Check for renaming '.' or '..' or alias of '.'
	 */
	if ((strcmp(snm, ".") == 0) ||
	    (strcmp(snm, "..") == 0) ||
	    (sdp == sip)) {
		error = EINVAL;
		rw_exit(&sip->i_contents);
		rw_exit(&sdp->i_contents);
		goto errout;
	}

	rw_exit(&sip->i_contents);
	rw_exit(&sdp->i_contents);

	/* pre-rename event for an existing target, if there is one */
	if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) {
		vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct);
		VN_RELE(ITOV(tip));
	}

	/* Notify the target dir. if not the same as the source dir. */
	if (sdvp != tdvp)
		vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);

	vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct);

	/*
	 * Link source to the target.
	 */
	rw_enter(&tdp->i_rwlock, RW_WRITER);
	if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
	    (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
		/*
		 * ESAME isn't really an error; it indicates that the
		 * operation should not be done because the source and target
		 * are the same file, but that no error should be reported.
		 */
		if (error == ESAME) {
			error = 0;
		}
		rw_exit(&tdp->i_rwlock);
		goto errout;
	}
	rw_exit(&tdp->i_rwlock);

	rw_enter(&sdp->i_rwlock, RW_WRITER);
	/*
	 * Unlink the source.
	 * Remove the source entry.  ud_dirremove() checks that the entry
	 * still reflects sip, and returns an error if it doesn't.
	 * If the entry has changed just forget about it.  Release
	 * the source inode.
	 */
	if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
	    DR_RENAME, cr, ct)) == ENOENT) {
		error = 0;
	}
	rw_exit(&sdp->i_rwlock);

	if (error == 0) {
		vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
		/*
		 * vnevent_rename_dest and vnevent_rename_dest_dir are called
		 * in ud_direnter().
		 */
	}

errout:
	ITIMES(sdp);
	ITIMES(tdp);
	/* drop the hold acquired by the initial ud_dirlook() */
	VN_RELE(ITOV(sip));
	mutex_exit(&udf_vfsp->udf_rename_lck);

	return (error);
}
1045
/* ARGSUSED */
/*
 * VOP_MKDIR for udf: create directory `dirname' in `dvp' with the
 * attributes in *vap and return a held vnode for it in *vpp. The
 * actual entry creation is done by ud_direnter(DE_MKDIR) under the
 * parent's i_rwlock.
 */
static int32_t
udf_mkdir(
	struct vnode *dvp,
	char *dirname,
	struct vattr *vap,
	struct vnode **vpp,
	struct cred *cr,
	caller_context_t *ct,
	int flags,
	vsecattr_t *vsecp)
{
	int32_t error;
	struct ud_inode *ip;
	struct ud_inode *xip;

	ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));

	ud_printf("udf_mkdir\n");

	ip = VTOI(dvp);
	rw_enter(&ip->i_rwlock, RW_WRITER);
	error = ud_direnter(ip, dirname, DE_MKDIR,
	    (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
	rw_exit(&ip->i_rwlock);
	ITIMES(ip);
	if (error == 0) {
		ip = xip;
		*vpp = ITOV(ip);
		ITIMES(ip);
	} else if (error == EEXIST) {
		/* an entry already exists; drop the hold ud_direnter took */
		ITIMES(xip);
		VN_RELE(ITOV(xip));
	}

	return (error);
}
1083
1084 /* ARGSUSED */
1085 static int32_t
1086 udf_rmdir(
1087 struct vnode *vp,
1088 char *nm,
1089 struct vnode *cdir,
1090 struct cred *cr,
1091 caller_context_t *ct,
1092 int flags)
1093 {
1094 int32_t error;
1095 struct ud_inode *ip = VTOI(vp);
1096
1097 ud_printf("udf_rmdir\n");
1098
1099 rw_enter(&ip->i_rwlock, RW_WRITER);
1100 error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
1101 cr, ct);
1102 rw_exit(&ip->i_rwlock);
1103 ITIMES(ip);
1104
1105 return (error);
1106 }
1107
/* ARGSUSED */
/*
 * VOP_READDIR for udf: translate on-disk file identifier descriptors
 * (FIDs) into dirent64 records in the caller's buffer. The "."
 * entry is synthesized at uio_offset 0 with the magic next-offset
 * 0x10 (mapped back to 0 on the following call); ".." comes from
 * the FID_PARENT descriptor. Deleted FIDs are skipped. *eofp is
 * set when the directory has been fully consumed.
 */
static int32_t
udf_readdir(
	struct vnode *vp,
	struct uio *uiop,
	struct cred *cr,
	int32_t *eofp,
	caller_context_t *ct,
	int flags)
{
	struct ud_inode *ip;
	struct dirent64 *nd;
	struct udf_vfs *udf_vfsp;
	int32_t error = 0, len, outcount = 0;
	uint32_t dirsiz, offset;
	uint32_t bufsize, ndlen, dummy;
	caddr_t outbuf;
	caddr_t outb, end_outb;
	struct iovec *iovp;

	uint8_t *dname;		/* scratch buffer for uncompressed names */
	int32_t length;

	uint8_t *buf = NULL;	/* scratch block for FIDs spanning blocks */

	struct fbuf *fbp = NULL;
	struct file_id *fid;
	uint8_t *name;


	ud_printf("udf_readdir\n");

	ip = VTOI(vp);
	udf_vfsp = ip->i_udf;

	/* already past the end, or unlinked directory: report EOF */
	dirsiz = ip->i_size;
	if ((uiop->uio_offset >= dirsiz) ||
	    (ip->i_nlink <= 0)) {
		if (eofp) {
			*eofp = 1;
		}
		return (0);
	}

	offset = uiop->uio_offset;
	iovp = uiop->uio_iov;
	bufsize = iovp->iov_len;

	/* staging buffer for dirents, copied out with uiomove() at the end */
	outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
	end_outb = outb + bufsize;
	nd = (struct dirent64 *)outbuf;

	dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
	buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);

	if (offset == 0) {
		/* synthesize "."; 0x10 marks it as already emitted */
		len = DIRENT64_RECLEN(1);
		if (((caddr_t)nd + len) >= end_outb) {
			error = EINVAL;
			goto end;
		}
		nd->d_ino = ip->i_icb_lbano;
		nd->d_reclen = (uint16_t)len;
		nd->d_off = 0x10;
		nd->d_name[0] = '.';
		bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
		nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
		outcount++;
	} else if (offset == 0x10) {
		/* "." already returned; restart real FIDs from offset 0 */
		offset = 0;
	}

	while (offset < dirsiz) {
		error = ud_get_next_fid(ip, &fbp,
		    offset, &fid, &name, buf);
		if (error != 0) {
			break;
		}

		if ((fid->fid_flags & FID_DELETED) == 0) {
			if (fid->fid_flags & FID_PARENT) {
				/* the parent FID becomes ".." */

				len = DIRENT64_RECLEN(2);
				if (((caddr_t)nd + len) >= end_outb) {
					error = EINVAL;
					break;
				}

				nd->d_ino = ip->i_icb_lbano;
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd->d_name[0] = '.';
				nd->d_name[1] = '.';
				bzero(&nd->d_name[2],
				    DIRENT64_NAMELEN(len) - 2);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			} else {
				/* decode the on-disk compressed name */
				if ((error = ud_uncompress(fid->fid_idlen,
				    &length, name, dname)) != 0) {
					break;
				}
				if (length == 0) {
					offset += FID_LEN(fid);
					continue;
				}
				len = DIRENT64_RECLEN(length);
				if (((caddr_t)nd + len) >= end_outb) {
					/*
					 * Buffer full; only an error if we
					 * could not fit even one entry.
					 */
					if (!outcount) {
						error = EINVAL;
					}
					break;
				}
				(void) strncpy(nd->d_name,
				    (caddr_t)dname, length);
				bzero(&nd->d_name[length],
				    DIRENT64_NAMELEN(len) - length);
				nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
				    SWAP_16(fid->fid_icb.lad_ext_prn),
				    SWAP_32(fid->fid_icb.lad_ext_loc), 1,
				    &dummy);
				nd->d_reclen = (uint16_t)len;
				nd->d_off = offset + FID_LEN(fid);
				nd = (struct dirent64 *)
				    ((char *)nd + nd->d_reclen);
			}
			outcount++;
		}

		offset += FID_LEN(fid);
	}

end:
	if (fbp != NULL) {
		fbrelse(fbp, S_OTHER);
	}
	ndlen = ((char *)nd - outbuf);
	/*
	 * In case of error do not call uiomove.
	 * Return the error to the caller.
	 */
	if ((error == 0) && (ndlen != 0)) {
		error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
		uiop->uio_offset = offset;
	}
	kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
	kmem_free((caddr_t)dname, 1024);
	kmem_free(outbuf, (uint32_t)bufsize);
	if (eofp && error == 0) {
		*eofp = (uiop->uio_offset >= dirsiz);
	}
	return (error);
}
1261
1262 /* ARGSUSED */
1263 static int32_t
1264 udf_symlink(
1265 struct vnode *dvp,
1266 char *linkname,
1267 struct vattr *vap,
1268 char *target,
1269 struct cred *cr,
1270 caller_context_t *ct,
1271 int flags)
1272 {
1273 int32_t error = 0, outlen;
1274 uint32_t ioflag = 0;
1275 struct ud_inode *ip, *dip = VTOI(dvp);
1276
1277 struct path_comp *pc;
1278 int8_t *dname = NULL, *uname = NULL, *sp;
1279
1280 ud_printf("udf_symlink\n");
1281
1282 ip = (struct ud_inode *)0;
1283 vap->va_type = VLNK;
1284 vap->va_rdev = 0;
1285
1286 rw_enter(&dip->i_rwlock, RW_WRITER);
1287 error = ud_direnter(dip, linkname, DE_CREATE,
1288 (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
1289 rw_exit(&dip->i_rwlock);
1290 if (error == 0) {
1291 dname = kmem_zalloc(1024, KM_SLEEP);
1292 uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1293
1294 pc = (struct path_comp *)uname;
1295 /*
1296 * If the first character in target is "/"
1297 * then skip it and create entry for it
1298 */
1299 if (*target == '/') {
1300 pc->pc_type = 2;
1301 pc->pc_len = 0;
1302 pc = (struct path_comp *)(((char *)pc) + 4);
1303 while (*target == '/') {
1304 target++;
1305 }
1306 }
1307
1308 while (*target != NULL) {
1309 sp = target;
1310 while ((*target != '/') && (*target != '\0')) {
1311 target ++;
1312 }
1313 /*
1314 * We got the next component of the
1315 * path name. Create path_comp of
1316 * appropriate type
1317 */
1318 if (((target - sp) == 1) && (*sp == '.')) {
1319 /*
1320 * Dot entry.
1321 */
1322 pc->pc_type = 4;
1323 pc = (struct path_comp *)(((char *)pc) + 4);
1324 } else if (((target - sp) == 2) &&
1325 (*sp == '.') && ((*(sp + 1)) == '.')) {
1326 /*
1327 * DotDot entry.
1328 */
1329 pc->pc_type = 3;
1330 pc = (struct path_comp *)(((char *)pc) + 4);
1331 } else {
1332 /*
1333 * convert the user given name
1334 * into appropriate form to be put
1335 * on the media
1336 */
1337 outlen = 1024; /* set to size of dname */
1338 if (error = ud_compress(target - sp, &outlen,
1339 (uint8_t *)sp, (uint8_t *)dname)) {
1340 break;
1341 }
1342 pc->pc_type = 5;
1343 /* LINTED */
1344 pc->pc_len = outlen;
1345 dname[outlen] = '\0';
1346 (void) strcpy((char *)pc->pc_id, dname);
1347 pc = (struct path_comp *)
1348 (((char *)pc) + 4 + outlen);
1349 }
1350 while (*target == '/') {
1351 target++;
1352 }
1353 if (*target == NULL) {
1354 break;
1355 }
1356 }
1357
1358 rw_enter(&ip->i_contents, RW_WRITER);
1359 if (error == 0) {
1360 ioflag = FWRITE;
1361 if (curthread->t_flag & T_DONTPEND) {
1362 ioflag |= FDSYNC;
1363 }
1364 error = ud_rdwri(UIO_WRITE, ioflag, ip,
1365 uname, ((int8_t *)pc) - uname,
1366 (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
1367 }
1368 if (error) {
1369 ud_idrop(ip);
1370 rw_exit(&ip->i_contents);
1371 rw_enter(&dip->i_rwlock, RW_WRITER);
1372 (void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
1373 (struct vnode *)0, DR_REMOVE, cr, ct);
1374 rw_exit(&dip->i_rwlock);
1375 goto update_inode;
1376 }
1377 rw_exit(&ip->i_contents);
1378 }
1379
1380 if ((error == 0) || (error == EEXIST)) {
1381 VN_RELE(ITOV(ip));
1382 }
1383
1384 update_inode:
1385 ITIMES(VTOI(dvp));
1386 if (uname != NULL) {
1387 kmem_free(uname, PAGESIZE);
1388 }
1389 if (dname != NULL) {
1390 kmem_free(dname, 1024);
1391 }
1392
1393 return (error);
1394 }
1395
1396 /* ARGSUSED */
1397 static int32_t
1398 udf_readlink(
1399 struct vnode *vp,
1400 struct uio *uiop,
1401 struct cred *cr,
1402 caller_context_t *ct)
1403 {
1404 int32_t error = 0, off, id_len, size, len;
1405 int8_t *dname = NULL, *uname = NULL;
1406 struct ud_inode *ip;
1407 struct fbuf *fbp = NULL;
1408 struct path_comp *pc;
1409
1410 ud_printf("udf_readlink\n");
1411
1412 if (vp->v_type != VLNK) {
1413 return (EINVAL);
1414 }
1415
1416 ip = VTOI(vp);
1417 size = ip->i_size;
1418 if (size > PAGESIZE) {
1419 return (EIO);
1420 }
1421
1422 if (size == 0) {
1423 return (0);
1424 }
1425
1426 dname = kmem_zalloc(1024, KM_SLEEP);
1427 uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1428
1429 rw_enter(&ip->i_contents, RW_READER);
1430
1431 if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1432 goto end;
1433 }
1434
1435 off = 0;
1436
1437 while (off < size) {
1438 pc = (struct path_comp *)(fbp->fb_addr + off);
1439 switch (pc->pc_type) {
1440 case 1 :
1441 (void) strcpy(uname, ip->i_udf->udf_fsmnt);
1442 (void) strcat(uname, "/");
1443 break;
1444 case 2 :
1445 if (pc->pc_len != 0) {
1446 goto end;
1447 }
1448 uname[0] = '/';
1449 uname[1] = '\0';
1450 break;
1451 case 3 :
1452 (void) strcat(uname, "../");
1453 break;
1454 case 4 :
1455 (void) strcat(uname, "./");
1456 break;
1457 case 5 :
1458 if ((error = ud_uncompress(pc->pc_len, &id_len,
1459 pc->pc_id, (uint8_t *)dname)) != 0) {
1460 break;
1461 }
1462 dname[id_len] = '\0';
1463 (void) strcat(uname, dname);
1464 (void) strcat(uname, "/");
1465 break;
1466 default :
1467 error = EINVAL;
1468 goto end;
1469 }
1470 off += 4 + pc->pc_len;
1471 }
1472 len = strlen(uname) - 1;
1473 if (uname[len] == '/') {
1474 if (len == 0) {
1475 /*
1476 * special case link to /
1477 */
1478 len = 1;
1479 } else {
1480 uname[len] = '\0';
1481 }
1482 }
1483
1484 error = uiomove(uname, len, UIO_READ, uiop);
1485
1486 ITIMES(ip);
1487
1488 end:
1489 if (fbp != NULL) {
1490 fbrelse(fbp, S_OTHER);
1491 }
1492 rw_exit(&ip->i_contents);
1493 if (uname != NULL) {
1494 kmem_free(uname, PAGESIZE);
1495 }
1496 if (dname != NULL) {
1497 kmem_free(dname, 1024);
1498 }
1499 return (error);
1500 }
1501
1502 /* ARGSUSED */
1503 static int32_t
1504 udf_fsync(
1505 struct vnode *vp,
1506 int32_t syncflag,
1507 struct cred *cr,
1508 caller_context_t *ct)
1509 {
1510 int32_t error = 0;
1511 struct ud_inode *ip = VTOI(vp);
1512
1513 ud_printf("udf_fsync\n");
1514
1515 rw_enter(&ip->i_contents, RW_WRITER);
1516 if (!(IS_SWAPVP(vp))) {
1517 error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1518 }
1519 if (error == 0) {
1520 error = ud_sync_indir(ip);
1521 }
1522 ITIMES(ip); /* XXX: is this necessary ??? */
1523 rw_exit(&ip->i_contents);
1524
1525 return (error);
1526 }
1527
/*
 * udf_inactive - called when the last reference to the vnode goes
 * away; all the real work is delegated to ud_iinactive().
 */
/* ARGSUSED */
static void
udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
{
	ud_printf("udf_iinactive\n");

	ud_iinactive(VTOI(vp), cr);
}
1536
1537 /* ARGSUSED */
1538 static int32_t
1539 udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1540 {
1541 struct udf_fid *udfidp;
1542 struct ud_inode *ip = VTOI(vp);
1543
1544 ud_printf("udf_fid\n");
1545
1546 if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1547 fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1548 return (ENOSPC);
1549 }
1550
1551 udfidp = (struct udf_fid *)fidp;
1552 bzero((char *)udfidp, sizeof (struct udf_fid));
1553 rw_enter(&ip->i_contents, RW_READER);
1554 udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1555 udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1556 udfidp->udfid_prn = ip->i_icb_prn;
1557 udfidp->udfid_icb_lbn = ip->i_icb_block;
1558 rw_exit(&ip->i_contents);
1559
1560 return (0);
1561 }
1562
1563 /* ARGSUSED2 */
1564 static int
1565 udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1566 {
1567 struct ud_inode *ip = VTOI(vp);
1568
1569 ud_printf("udf_rwlock\n");
1570
1571 if (write_lock) {
1572 rw_enter(&ip->i_rwlock, RW_WRITER);
1573 } else {
1574 rw_enter(&ip->i_rwlock, RW_READER);
1575 }
1576 #ifdef __lock_lint
1577 rw_exit(&ip->i_rwlock);
1578 #endif
1579 return (write_lock);
1580 }
1581
/*
 * udf_rwunlock - release the i_rwlock taken by udf_rwlock().
 */
/* ARGSUSED */
static void
udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
{
	struct ud_inode *ip = VTOI(vp);

	ud_printf("udf_rwunlock\n");

#ifdef __lock_lint
	rw_enter(&ip->i_rwlock, RW_WRITER);
#endif

	rw_exit(&ip->i_rwlock);

}
1597
1598 /* ARGSUSED */
1599 static int32_t
1600 udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1601 {
1602 return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1603 }
1604
1605 static int32_t
1606 udf_frlock(
1607 struct vnode *vp,
1608 int32_t cmd,
1609 struct flock64 *bfp,
1610 int32_t flag,
1611 offset_t offset,
1612 struct flk_callback *flk_cbp,
1613 cred_t *cr,
1614 caller_context_t *ct)
1615 {
1616 struct ud_inode *ip = VTOI(vp);
1617
1618 ud_printf("udf_frlock\n");
1619
1620 /*
1621 * If file is being mapped, disallow frlock.
1622 * XXX I am not holding tlock while checking i_mapcnt because the
1623 * current locking strategy drops all locks before calling fs_frlock.
1624 * So, mapcnt could change before we enter fs_frlock making is
1625 * meaningless to have held tlock in the first place.
1626 */
1627 if ((ip->i_mapcnt > 0) &&
1628 (MANDLOCK(vp, ip->i_char))) {
1629 return (EAGAIN);
1630 }
1631
1632 return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1633 }
1634
1635 /*ARGSUSED6*/
1636 static int32_t
1637 udf_space(
1638 struct vnode *vp,
1639 int32_t cmd,
1640 struct flock64 *bfp,
1641 int32_t flag,
1642 offset_t offset,
1643 cred_t *cr,
1644 caller_context_t *ct)
1645 {
1646 int32_t error = 0;
1647
1648 ud_printf("udf_space\n");
1649
1650 if (cmd != F_FREESP) {
1651 error = EINVAL;
1652 } else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1653 error = ud_freesp(vp, bfp, flag, cr);
1654
1655 if (error == 0) {
1656 if (bfp->l_start == 0) {
1657 vnevent_truncate(vp, ct);
1658 } else {
1659 vnevent_resize(vp, ct);
1660 }
1661 }
1662 }
1663
1664 return (error);
1665 }
1666
/*
 * udf_getpage - VOP_GETPAGE for UDF: return (locked) pages covering
 * <off, off+len) in plarr, creating or reading them as needed.
 *
 * Locking: i_contents is taken here unless the caller already owns it
 * (dolock).  A read lock normally suffices; if blocks must be
 * allocated (write faults into holes or beyond EOF) the lock is
 * upgraded to a writer via rw_tryupgrade(), retrying from scratch if
 * the upgrade fails.  plarr == NULL means async faultahead only.
 */
/* ARGSUSED */
static int32_t
udf_getpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	uint32_t *protp,
	struct page **plarr,
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip = VTOI(vp);
	int32_t error, has_holes, beyond_eof, seqmode, dolock;
	int32_t pgsize = PAGESIZE;
	struct udf_vfs *udf_vfsp = ip->i_udf;
	page_t **pl;
	u_offset_t pgoff, eoff, uoff;
	krw_t rwtype;
	caddr_t pgaddr;

	ud_printf("udf_getpage\n");

	uoff = (u_offset_t)off; /* type conversion */
	if (protp) {
		*protp = PROT_ALL;
	}
	if (vp->v_flag & VNOMAP) {
		return (ENOSYS);
	}
	/* Sequential access if this fault continues the last read. */
	seqmode = ip->i_nextr == uoff && rw != S_CREATE;

	rwtype = RW_READER;
	dolock = (rw_owner(&ip->i_contents) != curthread);
retrylock:
#ifdef __lock_lint
	rw_enter(&ip->i_contents, rwtype);
#else
	if (dolock) {
		rw_enter(&ip->i_contents, rwtype);
	}
#endif

	/*
	 * We may be getting called as a side effect of a bmap using
	 * fbread() when the blocks might be being allocated and the
	 * size has not yet been up'ed. In this case we want to be
	 * able to return zero pages if we get back UDF_HOLE from
	 * calling bmap for a non write case here. We also might have
	 * to read some frags from the disk into a page if we are
	 * extending the number of frags for a given lbn in bmap().
	 */
	beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
	if (beyond_eof && seg != segkmap) {
#ifdef __lock_lint
		rw_exit(&ip->i_contents);
#else
		if (dolock) {
			rw_exit(&ip->i_contents);
		}
#endif
		return (EFAULT);
	}

	/*
	 * Must hold i_contents lock throughout the call to pvn_getpages
	 * since locked pages are returned from each call to ud_getapage.
	 * Must *not* return locked pages and then try for contents lock
	 * due to lock ordering requirements (inode > page)
	 */

	has_holes = ud_bmap_has_holes(ip);

	if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
		int32_t	blk_size, count;
		u_offset_t	offset;

		/*
		 * We must acquire the RW_WRITER lock in order to
		 * call bmap_write().
		 */
		if (dolock && rwtype == RW_READER) {
			rwtype = RW_WRITER;

			if (!rw_tryupgrade(&ip->i_contents)) {

				rw_exit(&ip->i_contents);

				goto retrylock;
			}
		}

		/*
		 * May be allocating disk blocks for holes here as
		 * a result of mmap faults. write(2) does the bmap_write
		 * in rdip/wrip, not here. We are not dealing with frags
		 * in this case.
		 */
		offset = uoff;
		while ((offset < uoff + len) &&
		    (offset < ip->i_size)) {
			/*
			 * Allocate backing store one logical block at a
			 * time, clamping the final block to i_size.
			 */

			blk_size = udf_vfsp->udf_lbsize;
			if ((offset + blk_size) > ip->i_size) {
				count = ip->i_size - offset;
			} else {
				count = blk_size;
			}
			error = ud_bmap_write(ip, offset, count, 0, cr);
			if (error) {
				goto update_inode;
			}
			offset += count; /* XXX - make this contig */
		}
	}

	/*
	 * Can be a reader from now on.
	 */
#ifdef __lock_lint
	if (rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#else
	if (dolock && rwtype == RW_WRITER) {
		rw_downgrade(&ip->i_contents);
	}
#endif

	/*
	 * We remove PROT_WRITE in cases when the file has UDF holes
	 * because we don't want to call bmap_read() to check each
	 * page if it is backed with a disk block.
	 */
	if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
		*protp &= ~PROT_WRITE;
	}

	error = 0;

	/*
	 * The loop looks up pages in the range <off, off + len).
	 * For each page, we first check if we should initiate an asynchronous
	 * read ahead before we call page_lookup (we may sleep in page_lookup
	 * for a previously initiated disk read).
	 */
	eoff = (uoff + len);
	for (pgoff = uoff, pgaddr = addr, pl = plarr;
	    pgoff < eoff; /* empty */) {
		page_t	*pp;
		u_offset_t	nextrio;
		se_t	se;

		se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);

		/*
		 * Handle async getpage (faultahead)
		 */
		if (plarr == NULL) {
			ip->i_nextrio = pgoff;
			ud_getpage_ra(vp, pgoff, seg, pgaddr);
			pgoff += pgsize;
			pgaddr += pgsize;
			continue;
		}

		/*
		 * Check if we should initiate read ahead of next cluster.
		 * We call page_exists only when we need to confirm that
		 * we have the current page before we initiate the read ahead.
		 */
		nextrio = ip->i_nextrio;
		if (seqmode &&
		    pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
		    nextrio < ip->i_size && page_exists(vp, pgoff))
			ud_getpage_ra(vp, pgoff, seg, pgaddr);

		if ((pp = page_lookup(vp, pgoff, se)) != NULL) {

			/*
			 * We found the page in the page cache.
			 */
			*pl++ = pp;
			pgoff += pgsize;
			pgaddr += pgsize;
			len -= pgsize;
			plsz -= pgsize;
		} else {

			/*
			 * We have to create the page, or read it from disk.
			 */
			if (error = ud_getpage_miss(vp, pgoff, len,
			    seg, pgaddr, pl, plsz, rw, seqmode)) {
				goto error_out;
			}

			/* Skip past the pages the miss handler filled in. */
			while (*pl != NULL) {
				pl++;
				pgoff += pgsize;
				pgaddr += pgsize;
				len -= pgsize;
				plsz -= pgsize;
			}
		}
	}

	/*
	 * Return pages up to plsz if they are in the page cache.
	 * We cannot return pages if there is a chance that they are
	 * backed with a UDF hole and rw is S_WRITE or S_CREATE.
	 */
	if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {

		ASSERT((protp == NULL) ||
		    !(has_holes && (*protp & PROT_WRITE)));

		eoff = pgoff + plsz;
		while (pgoff < eoff) {
			page_t *pp;

			if ((pp = page_lookup_nowait(vp, pgoff,
			    SE_SHARED)) == NULL)
				break;

			*pl++ = pp;
			pgoff += pgsize;
			plsz -= pgsize;
		}
	}

	if (plarr)
		*pl = NULL; /* Terminate page list */
	ip->i_nextr = pgoff;

error_out:
	if (error && plarr) {
		/*
		 * Release any pages we have locked.
		 */
		while (pl > &plarr[0])
			page_unlock(*--pl);

		plarr[0] = NULL;
	}

update_inode:
#ifdef __lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif

	/*
	 * If the inode is not already marked for IACC (in rwip() for read)
	 * and the inode is not marked for no access time update (in rwip()
	 * for write) then update the inode access time and mod time now.
	 */
	mutex_enter(&ip->i_tlock);
	if ((ip->i_flag & (IACC | INOACC)) == 0) {
		if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
			ip->i_flag |= IACC;
		}
		if (rw == S_WRITE) {
			ip->i_flag |= IUPD;
		}
		ITIMES_NOLOCK(ip);
	}
	mutex_exit(&ip->i_tlock);

	return (error);
}
1949
1950 int32_t ud_delay = 1;
1951
/*
 * udf_putpage - VOP_PUTPAGE for UDF: write out dirty pages in
 * <off, off+len).  Plain B_ASYNC requests are accumulated into a
 * per-inode delayed-write cluster (i_delayoff/i_delaylen) and pushed
 * only when the cluster is full or a non-contiguous request arrives;
 * everything else is handed straight to ud_putpages().
 */
/* ARGSUSED */
static int32_t
udf_putpage(
	struct vnode *vp,
	offset_t off,
	size_t len,
	int32_t flags,
	struct cred *cr,
	caller_context_t *ct)
{
	struct ud_inode *ip;
	int32_t error = 0;

	ud_printf("udf_putpage\n");

	ip = VTOI(vp);
#ifdef	__lock_lint
	rw_enter(&ip->i_contents, RW_WRITER);
#endif

	if (vp->v_count == 0) {
		cmn_err(CE_WARN, "ud_putpage : bad v_count");
		error = EINVAL;
		goto out;
	}

	if (vp->v_flag & VNOMAP) {
		error = ENOSYS;
		goto out;
	}

	if (flags & B_ASYNC) {
		/* Only cluster plain async pushes with a known length. */
		if (ud_delay && len &&
		    (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
			mutex_enter(&ip->i_tlock);

			/*
			 * If nobody stalled, start a new cluster.
			 */
			if (ip->i_delaylen == 0) {
				ip->i_delayoff = off;
				ip->i_delaylen = len;
				mutex_exit(&ip->i_tlock);
				goto out;
			}

			/*
			 * If we have a full cluster or they are not contig,
			 * then push last cluster and start over.
			 */
			if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
			    ip->i_delayoff + ip->i_delaylen != off) {
				u_offset_t doff;
				size_t dlen;

				doff = ip->i_delayoff;
				dlen = ip->i_delaylen;
				ip->i_delayoff = off;
				ip->i_delaylen = len;
				mutex_exit(&ip->i_tlock);
				error = ud_putpages(vp, doff, dlen, flags, cr);
				/* LMXXX - flags are new val, not old */
				goto out;
			}

			/*
			 * There is something there, it's not full, and
			 * it is contig.
			 */
			ip->i_delaylen += len;
			mutex_exit(&ip->i_tlock);
			goto out;
		}

		/*
		 * Must have weird flags or we are not clustering.
		 */
	}

	error = ud_putpages(vp, off, len, flags, cr);

out:
#ifdef	__lock_lint
	rw_exit(&ip->i_contents);
#endif
	return (error);
}
2039
2040 /* ARGSUSED */
2041 static int32_t
2042 udf_map(
2043 struct vnode *vp,
2044 offset_t off,
2045 struct as *as,
2046 caddr_t *addrp,
2047 size_t len,
2048 uint8_t prot,
2049 uint8_t maxprot,
2050 uint32_t flags,
2051 struct cred *cr,
2052 caller_context_t *ct)
2053 {
2054 struct segvn_crargs vn_a;
2055 int32_t error = 0;
2056
2057 ud_printf("udf_map\n");
2058
2059 if (vp->v_flag & VNOMAP) {
2060 error = ENOSYS;
2061 goto end;
2062 }
2063
2064 if ((off < (offset_t)0) ||
2065 ((off + len) < (offset_t)0)) {
2066 error = EINVAL;
2067 goto end;
2068 }
2069
2070 if (vp->v_type != VREG) {
2071 error = ENODEV;
2072 goto end;
2073 }
2074
2075 /*
2076 * If file is being locked, disallow mapping.
2077 */
2078 if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
2079 error = EAGAIN;
2080 goto end;
2081 }
2082
2083 as_rangelock(as);
2084 error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2085 if (error != 0) {
2086 as_rangeunlock(as);
2087 goto end;
2088 }
2089
2090 vn_a.vp = vp;
2091 vn_a.offset = off;
2092 vn_a.type = flags & MAP_TYPE;
2093 vn_a.prot = prot;
2094 vn_a.maxprot = maxprot;
2095 vn_a.cred = cr;
2096 vn_a.amp = NULL;
2097 vn_a.flags = flags & ~MAP_TYPE;
2098 vn_a.szc = 0;
2099 vn_a.lgrp_mem_policy_flags = 0;
2100
2101 error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
2102 as_rangeunlock(as);
2103
2104 end:
2105 return (error);
2106 }
2107
2108 /* ARGSUSED */
2109 static int32_t
2110 udf_addmap(struct vnode *vp,
2111 offset_t off,
2112 struct as *as,
2113 caddr_t addr,
2114 size_t len,
2115 uint8_t prot,
2116 uint8_t maxprot,
2117 uint32_t flags,
2118 struct cred *cr,
2119 caller_context_t *ct)
2120 {
2121 struct ud_inode *ip = VTOI(vp);
2122
2123 ud_printf("udf_addmap\n");
2124
2125 if (vp->v_flag & VNOMAP) {
2126 return (ENOSYS);
2127 }
2128
2129 mutex_enter(&ip->i_tlock);
2130 ip->i_mapcnt += btopr(len);
2131 mutex_exit(&ip->i_tlock);
2132
2133 return (0);
2134 }
2135
2136 /* ARGSUSED */
2137 static int32_t
2138 udf_delmap(
2139 struct vnode *vp, offset_t off,
2140 struct as *as,
2141 caddr_t addr,
2142 size_t len,
2143 uint32_t prot,
2144 uint32_t maxprot,
2145 uint32_t flags,
2146 struct cred *cr,
2147 caller_context_t *ct)
2148 {
2149 struct ud_inode *ip = VTOI(vp);
2150
2151 ud_printf("udf_delmap\n");
2152
2153 if (vp->v_flag & VNOMAP) {
2154 return (ENOSYS);
2155 }
2156
2157 mutex_enter(&ip->i_tlock);
2158 ip->i_mapcnt -= btopr(len); /* Count released mappings */
2159 ASSERT(ip->i_mapcnt >= 0);
2160 mutex_exit(&ip->i_tlock);
2161
2162 return (0);
2163 }
2164
2165 /* ARGSUSED */
2166 static int32_t
2167 udf_l_pathconf(
2168 struct vnode *vp,
2169 int32_t cmd,
2170 ulong_t *valp,
2171 struct cred *cr,
2172 caller_context_t *ct)
2173 {
2174 int32_t error = 0;
2175
2176 ud_printf("udf_l_pathconf\n");
2177
2178 if (cmd == _PC_FILESIZEBITS) {
2179 /*
2180 * udf supports 64 bits as file size
2181 * but there are several other restrictions
2182 * it only supports 32-bit block numbers and
2183 * daddr32_t is only and int32_t so taking these
2184 * into account we can stay just as where ufs is
2185 */
2186 *valp = 41;
2187 } else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
2188 /* nanosecond timestamp resolution */
2189 *valp = 1L;
2190 } else {
2191 error = fs_pathconf(vp, cmd, valp, cr, ct);
2192 }
2193
2194 return (error);
2195 }
2196
2197 uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2198 #ifndef __lint
2199 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
2200 _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2201 #endif
2202 /*
2203 * Assumption is that there will not be a pageio request
2204 * to a enbedded file
2205 */
2206 /* ARGSUSED */
2207 static int32_t
2208 udf_pageio(
2209 struct vnode *vp,
2210 struct page *pp,
2211 u_offset_t io_off,
2212 size_t io_len,
2213 int32_t flags,
2214 struct cred *cr,
2215 caller_context_t *ct)
2216 {
2217 daddr_t bn;
2218 struct buf *bp;
2219 struct ud_inode *ip = VTOI(vp);
2220 int32_t dolock, error = 0, contig, multi_io;
2221 size_t done_len = 0, cur_len = 0;
2222 page_t *npp = NULL, *opp = NULL, *cpp = pp;
2223
2224 if (pp == NULL) {
2225 return (EINVAL);
2226 }
2227
2228 dolock = (rw_owner(&ip->i_contents) != curthread);
2229
2230 /*
2231 * We need a better check. Ideally, we would use another
2232 * vnodeops so that hlocked and forcibly unmounted file
2233 * systems would return EIO where appropriate and w/o the
2234 * need for these checks.
2235 */
2236 if (ip->i_udf == NULL) {
2237 return (EIO);
2238 }
2239
2240 #ifdef __lock_lint
2241 rw_enter(&ip->i_contents, RW_READER);
2242 #else
2243 if (dolock) {
2244 rw_enter(&ip->i_contents, RW_READER);
2245 }
2246 #endif
2247
2248 /*
2249 * Break the io request into chunks, one for each contiguous
2250 * stretch of disk blocks in the target file.
2251 */
2252 while (done_len < io_len) {
2253 ASSERT(cpp);
2254 bp = NULL;
2255 contig = 0;
2256 if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
2257 &bn, &contig)) {
2258 break;
2259 }
2260
2261 if (bn == UDF_HOLE) { /* No holey swapfiles */
2262 cmn_err(CE_WARN, "SWAP file has HOLES");
2263 error = EINVAL;
2264 break;
2265 }
2266
2267 cur_len = MIN(io_len - done_len, contig);
2268
2269 /*
2270 * Check if more than one I/O is
2271 * required to complete the given
2272 * I/O operation
2273 */
2274 if (ip->i_udf->udf_lbsize < PAGESIZE) {
2275 if (cur_len >= PAGESIZE) {
2276 multi_io = 0;
2277 cur_len &= PAGEMASK;
2278 } else {
2279 multi_io = 1;
2280 cur_len = MIN(io_len - done_len, PAGESIZE);
2281 }
2282 }
2283 page_list_break(&cpp, &npp, btop(cur_len));
2284
2285 bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2286 ASSERT(bp != NULL);
2287
2288 bp->b_edev = ip->i_dev;
2289 bp->b_dev = cmpdev(ip->i_dev);
2290 bp->b_blkno = bn;
2291 bp->b_un.b_addr = (caddr_t)0;
2292 bp->b_file = vp;
2293 bp->b_offset = (offset_t)(io_off + done_len);
2294
2295 /*
2296 * ub.ub_pageios.value.ul++;
2297 */
2298 if (multi_io == 0) {
2299 (void) bdev_strategy(bp);
2300 } else {
2301 error = ud_multi_strat(ip, cpp, bp,
2302 (u_offset_t)(io_off + done_len));
2303 if (error != 0) {
2304 pageio_done(bp);
2305 break;
2306 }
2307 }
2308 if (flags & B_READ) {
2309 ud_pageio_reads++;
2310 } else {
2311 ud_pageio_writes++;
2312 }
2313
2314 /*
2315 * If the request is not B_ASYNC, wait for i/o to complete
2316 * and re-assemble the page list to return to the caller.
2317 * If it is B_ASYNC we leave the page list in pieces and
2318 * cleanup() will dispose of them.
2319 */
2320 if ((flags & B_ASYNC) == 0) {
2321 error = biowait(bp);
2322 pageio_done(bp);
2323 if (error) {
2324 break;
2325 }
2326 page_list_concat(&opp, &cpp);
2327 }
2328 cpp = npp;
2329 npp = NULL;
2330 done_len += cur_len;
2331 }
2332
2333 ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2334 if (error) {
2335 if (flags & B_ASYNC) {
2336 /* Cleanup unprocessed parts of list */
2337 page_list_concat(&cpp, &npp);
2338 if (flags & B_READ) {
2339 pvn_read_done(cpp, B_ERROR);
2340 } else {
2341 pvn_write_done(cpp, B_ERROR);
2342 }
2343 } else {
2344 /* Re-assemble list and let caller clean up */
2345 page_list_concat(&opp, &cpp);
2346 page_list_concat(&opp, &npp);
2347 }
2348 }
2349
2350 #ifdef __lock_lint
2351 rw_exit(&ip->i_contents);
2352 #else
2353 if (dolock) {
2354 rw_exit(&ip->i_contents);
2355 }
2356 #endif
2357 return (error);
2358 }
2359
2360
2361
2362
2363 /* -------------------- local functions --------------------------- */
2364
2365
2366
2367 int32_t
2368 ud_rdwri(enum uio_rw rw, int32_t ioflag,
2369 struct ud_inode *ip, caddr_t base, int32_t len,
2370 offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2371 {
2372 int32_t error;
2373 struct uio auio;
2374 struct iovec aiov;
2375
2376 ud_printf("ud_rdwri\n");
2377
2378 bzero((caddr_t)&auio, sizeof (uio_t));
2379 bzero((caddr_t)&aiov, sizeof (iovec_t));
2380
2381 aiov.iov_base = base;
2382 aiov.iov_len = len;
2383 auio.uio_iov = &aiov;
2384 auio.uio_iovcnt = 1;
2385 auio.uio_loffset = offset;
2386 auio.uio_segflg = (int16_t)seg;
2387 auio.uio_resid = len;
2388
2389 if (rw == UIO_WRITE) {
2390 auio.uio_fmode = FWRITE;
2391 auio.uio_extflg = UIO_COPY_DEFAULT;
2392 auio.uio_llimit = curproc->p_fsz_ctl;
2393 error = ud_wrip(ip, &auio, ioflag, cr);
2394 } else {
2395 auio.uio_fmode = FREAD;
2396 auio.uio_extflg = UIO_COPY_CACHED;
2397 auio.uio_llimit = MAXOFFSET_T;
2398 error = ud_rdip(ip, &auio, ioflag, cr);
2399 }
2400
2401 if (aresid) {
2402 *aresid = auio.uio_resid;
2403 } else if (auio.uio_resid) {
2404 error = EIO;
2405 }
2406 return (error);
2407 }
2408
2409 /*
2410 * Free behind hacks. The pager is busted.
2411 * XXX - need to pass the information down to writedone() in a flag like B_SEQ
2412 * or B_FREE_IF_TIGHT_ON_MEMORY.
2413 */
2414 int32_t ud_freebehind = 1;
2415 int32_t ud_smallfile = 32 * 1024;
2416
2417 /* ARGSUSED */
2418 int32_t
2419 ud_getpage_miss(struct vnode *vp, u_offset_t off,
2420 size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2421 size_t plsz, enum seg_rw rw, int32_t seq)
2422 {
2423 struct ud_inode *ip = VTOI(vp);
2424 int32_t err = 0;
2425 size_t io_len;
2426 u_offset_t io_off;
2427 u_offset_t pgoff;
2428 page_t *pp;
2429
2430 pl[0] = NULL;
2431
2432 /*
2433 * Figure out whether the page can be created, or must be
2434 * read from the disk
2435 */
2436 if (rw == S_CREATE) {
2437 if ((pp = page_create_va(vp, off,
2438 PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2439 cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2440 return (EINVAL);
2441 }
2442 io_len = PAGESIZE;
2443 } else {
2444 pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2445 &io_len, off, PAGESIZE, 0);
2446
2447 /*
2448 * Some other thread has entered the page.
2449 * ud_getpage will retry page_lookup.
2450 */
2451 if (pp == NULL) {
2452 return (0);
2453 }
2454
2455 /*
2456 * Fill the page with as much data as we can from the file.
2457 */
2458 err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2459 if (err) {
2460 pvn_read_done(pp, B_ERROR);
2461 return (err);
2462 }
2463
2464 /*
2465 * XXX ??? ufs has io_len instead of pgoff below
2466 */
2467 ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2468
2469 /*
2470 * If the file access is sequential, initiate read ahead
2471 * of the next cluster.
2472 */
2473 if (seq && ip->i_nextrio < ip->i_size) {
2474 ud_getpage_ra(vp, off, seg, addr);
2475 }
2476 }
2477
2478 outmiss:
2479 pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2480 return (err);
2481 }
2482
/*
 * ud_getpage_ra - start asynchronous read-ahead of the cluster at
 * ip->i_nextrio.  Best-effort: silently returns if the target lies
 * outside the segment, if the block is unmapped/a hole, or if the
 * page already exists.  On success, advances i_nextrio past the pages
 * whose I/O was initiated.
 */
/* ARGSUSED */
void
ud_getpage_ra(struct vnode *vp,
	u_offset_t off, struct seg *seg, caddr_t addr)
{
	page_t *pp;
	size_t io_len;
	struct ud_inode *ip = VTOI(vp);
	u_offset_t io_off = ip->i_nextrio, pgoff;
	caddr_t addr2 = addr + (io_off - off);
	daddr_t bn;
	int32_t contig = 0;

	/*
	 * Don't read ahead past the end of the segment.
	 */
	if (addr2 >= seg->s_base + seg->s_size) {
		return;
	}

	contig = 0;
	/* No read-ahead into unmapped blocks or holes. */
	if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
		return;
	}

	pp = pvn_read_kluster(vp, io_off, seg, addr2,
	    &io_off, &io_len, io_off, PAGESIZE, 1);

	/*
	 * Some other thread has entered the page, so no read-ahead is
	 * done here (the eventual reader will wait for the I/O then).
	 */
	if (pp == NULL) {
		return;
	}

	(void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
	ip->i_nextrio = io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
}
2525
/*
 * Fill the page list "pp" with file data starting at offset "off".
 *
 * For embedded files (ICB_FLAG_ONE_AD) the data lives inside the
 * file_entry block, so it is read through the buffer cache and copied
 * into the page via a transient kernel mapping.  Otherwise the backing
 * disk blocks are looked up with ud_bmap_read() and a pageio buf is
 * issued - as a single bdev_strategy() call when the extent is
 * contiguous, or through ud_multi_strat() when the page spans multiple
 * extents.
 *
 * bflgs is B_READ optionally or'ed with B_ASYNC; when B_ASYNC is clear
 * the I/O is waited for here and the buf is torn down.  On return
 * *pg_off holds the number of bytes of the page actually covered
 * (clipped to i_size).  Returns 0 or an errno.
 */
int
ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
	uint32_t bflgs, u_offset_t *pg_off)
{
	daddr_t bn;
	struct buf *bp;
	caddr_t kaddr, caddr;
	int32_t error = 0, contig = 0, multi_io = 0;
	int32_t lbsize = ip->i_udf->udf_lbsize;
	int32_t lbmask = ip->i_udf->udf_lbmask;
	uint64_t isize;

	/* i_size rounded up to a logical-block boundary */
	isize = (ip->i_size + lbmask) & (~lbmask);
	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {

		/*
		 * Embedded file read file_entry
		 * from buffer cache and copy the required
		 * portions
		 */
		bp = ud_bread(ip->i_dev,
			ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
		if ((bp->b_error == 0) &&
			(bp->b_resid == 0)) {

			/* data starts i_data_off bytes into the block */
			caddr = bp->b_un.b_addr + ip->i_data_off;

			/*
			 * mapin to kvm
			 */
			kaddr = (caddr_t)ppmapin(pp,
				PROT_READ | PROT_WRITE, (caddr_t)-1);
			(void) kcopy(caddr, kaddr, ip->i_size);

			/*
			 * mapout of kvm
			 */
			ppmapout(kaddr);
		}
		brelse(bp);
		contig = ip->i_size;
	} else {

		/*
		 * Get the continuous size and block number
		 * at offset "off"
		 */
		if (error = ud_bmap_read(ip, off, &bn, &contig))
			goto out;
		contig = MIN(contig, PAGESIZE);
		/* round the transfer up to a logical-block multiple */
		contig = (contig + lbmask) & (~lbmask);

		/*
		 * Zero part of the page which we are not
		 * going to read from the disk.
		 */

		if (bn == UDF_HOLE) {

			/*
			 * This is a HOLE. Just zero out
			 * the page
			 * NOTE(review): zeroes via pp->p_prev (the tail of
			 * the kluster list) - confirm that is the intended
			 * page of the list.
			 */
			if (((off + contig) == isize) ||
				(contig == PAGESIZE)) {
				pagezero(pp->p_prev, 0, PAGESIZE);
				goto out;
			}
		}

		if (contig < PAGESIZE) {
			uint64_t count;

			count = isize - off;
			if (contig != count) {
				/*
				 * Page spans more than one extent; the
				 * I/O must be done in pieces.
				 */
				multi_io = 1;
				contig = (int32_t)(MIN(count, PAGESIZE));
			} else {
				/* short read at EOF: zero the tail */
				pagezero(pp->p_prev, contig, PAGESIZE - contig);
			}
		}

		/*
		 * Get a bp and initialize it
		 */
		bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
		ASSERT(bp != NULL);

		bp->b_edev = ip->i_dev;
		bp->b_dev = cmpdev(ip->i_dev);
		bp->b_blkno = bn;
		bp->b_un.b_addr = 0;
		bp->b_file = ip->i_vnode;

		/*
		 * Start I/O
		 */
		if (multi_io == 0) {

			/*
			 * Single I/O is sufficient for this page
			 */
			(void) bdev_strategy(bp);
		} else {

			/*
			 * We need to do the I/O in
			 * piece's
			 */
			error = ud_multi_strat(ip, pp, bp, off);
			if (error != 0) {
				goto out;
			}
		}
		if ((bflgs & B_ASYNC) == 0) {

			/*
			 * Wait for i/o to complete.
			 */

			error = biowait(bp);
			pageio_done(bp);
			if (error) {
				goto out;
			}
		}
	}
	/* clip the reported fill length to the true end of file */
	if ((off + contig) >= ip->i_size) {
		contig = ip->i_size - off;
	}

out:
	*pg_off = contig;
	return (error);
}
2661
/*
 * Flush dirty pages of the vnode in the range [off, off + len).
 * A len of 0 means "from off to the end of the file": the whole
 * vnode page list is walked with pvn_vplist_dirty().  Otherwise the
 * range is walked page by page and each dirty page (plus whatever
 * ud_putapage klusters with it) is pushed with ud_putapage().
 * "flags" are the B_* pageout flags (B_INVAL, B_FREE, B_ASYNC, ...).
 * The inode contents lock is held across the I/O unless the caller
 * already owns it.  Returns 0 or an errno from the page push.
 */
int32_t
ud_putpages(struct vnode *vp, offset_t off,
	size_t len, int32_t flags, struct cred *cr)
{
	struct ud_inode *ip;
	page_t *pp;
	u_offset_t io_off;
	size_t io_len;
	u_offset_t eoff;
	int32_t err = 0;
	int32_t dolock;

	ud_printf("ud_putpages\n");

	if (vp->v_count == 0) {
		cmn_err(CE_WARN, "ud_putpages: bad v_count");
		return (EINVAL);
	}

	ip = VTOI(vp);

	/*
	 * Acquire the readers/write inode lock before locking
	 * any pages in this inode.
	 * The inode lock is held during i/o.
	 */
	if (len == 0) {
		/* whole-file flush: clear any recorded delayed-write range */
		mutex_enter(&ip->i_tlock);
		ip->i_delayoff = ip->i_delaylen = 0;
		mutex_exit(&ip->i_tlock);
	}
#ifdef __lock_lint
	rw_enter(&ip->i_contents, RW_READER);
#else
	/* don't recurse if this thread already holds i_contents */
	dolock = (rw_owner(&ip->i_contents) != curthread);
	if (dolock) {
		rw_enter(&ip->i_contents, RW_READER);
	}
#endif

	if (!vn_has_cached_data(vp)) {
#ifdef __lock_lint
		rw_exit(&ip->i_contents);
#else
		if (dolock) {
			rw_exit(&ip->i_contents);
		}
#endif
		return (0);
	}

	if (len == 0) {
		/*
		 * Search the entire vp list for pages >= off.
		 */
		err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
			flags, cr);
	} else {
		/*
		 * Loop over all offsets in the range looking for
		 * pages to deal with.
		 */
		if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
			eoff = MIN(off + len, eoff);
		} else {
			eoff = off + len;
		}

		for (io_off = off; io_off < eoff; io_off += io_len) {
			/*
			 * If we are not invalidating, synchronously
			 * freeing or writing pages, use the routine
			 * page_lookup_nowait() to prevent reclaiming
			 * them from the free list.
			 */
			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
				pp = page_lookup(vp, io_off,
					(flags & (B_INVAL | B_FREE)) ?
					    SE_EXCL : SE_SHARED);
			} else {
				pp = page_lookup_nowait(vp, io_off,
					(flags & B_FREE) ? SE_EXCL : SE_SHARED);
			}

			if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
				/* page absent or clean: step one page */
				io_len = PAGESIZE;
			} else {

				err = ud_putapage(vp, pp,
					&io_off, &io_len, flags, cr);
				if (err != 0) {
					break;
				}
				/*
				 * "io_off" and "io_len" are returned as
				 * the range of pages we actually wrote.
				 * This allows us to skip ahead more quickly
				 * since several pages may've been dealt
				 * with by this iteration of the loop.
				 */
			}
		}
	}
	if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
		/*
		 * We have just sync'ed back all the pages on
		 * the inode, turn off the IMODTIME flag.
		 */
		mutex_enter(&ip->i_tlock);
		ip->i_flag &= ~IMODTIME;
		mutex_exit(&ip->i_tlock);
	}
#ifdef __lock_lint
	rw_exit(&ip->i_contents);
#else
	if (dolock) {
		rw_exit(&ip->i_contents);
	}
#endif
	return (err);
}
2783
/*
 * Write out a single dirty page (klustered with neighbouring dirty
 * pages when possible).  Used as the putapage callback for
 * pvn_vplist_dirty() and called directly from ud_putpages().
 *
 * For embedded files (ICB_FLAG_ONE_AD) the page contents are copied
 * back into the file_entry block and the block is written with
 * bwrite().  Otherwise a pageio buf is built and pushed via
 * bdev_strategy() or, for pages spanning several extents,
 * ud_multi_strat(); write throttling is accounted through i_writes /
 * ud_iodone.  On success *offp/*lenp report the range actually
 * written.  Returns 0 or an errno; on error the pages are released
 * with B_ERROR.
 */
/* ARGSUSED */
int32_t
ud_putapage(struct vnode *vp,
	page_t *pp, u_offset_t *offp,
	size_t *lenp, int32_t flags, struct cred *cr)
{
	daddr_t bn;
	size_t io_len;
	struct ud_inode *ip;
	int32_t error = 0, contig, multi_io = 0;
	struct udf_vfs *udf_vfsp;
	u_offset_t off, io_off;
	caddr_t kaddr, caddr;
	struct buf *bp = NULL;
	int32_t lbmask;
	uint64_t isize;
	uint16_t crc_len;
	struct file_entry *fe;

	ud_printf("ud_putapage\n");

	ip = VTOI(vp);
	ASSERT(ip);
	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	lbmask = ip->i_udf->udf_lbmask;
	/* i_size rounded up to a logical-block boundary */
	isize = (ip->i_size + lbmask) & (~lbmask);

	udf_vfsp = ip->i_udf;
	ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);

	/*
	 * If the modified time on the inode has not already been
	 * set elsewhere (e.g. for write/setattr) we set the time now.
	 * This gives us approximate modified times for mmap'ed files
	 * which are modified via stores in the user address space.
	 */
	if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
		mutex_enter(&ip->i_tlock);
		ip->i_flag |= IUPD;
		ITIMES_NOLOCK(ip);
		mutex_exit(&ip->i_tlock);
	}


	/*
	 * Align the request to a block boundary (for old file systems),
	 * and go ask bmap() how contiguous things are for this file.
	 */
	off = pp->p_offset & ~(offset_t)lbmask;
	/* block align it */


	if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
		/* embedded data must fit inside the file_entry */
		ASSERT(ip->i_size <= ip->i_max_emb);

		pp = pvn_write_kluster(vp, pp, &io_off,
			&io_len, off, PAGESIZE, flags);
		if (io_len == 0) {
			io_len = PAGESIZE;
		}

		/* read the file_entry block that holds the data */
		bp = ud_bread(ip->i_dev,
			ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
			udf_vfsp->udf_lbsize);
		fe = (struct file_entry *)bp->b_un.b_addr;
		if ((bp->b_flags & B_ERROR) ||
		    (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
		    ip->i_icb_block,
		    1, udf_vfsp->udf_lbsize) != 0)) {
			/* bad read or corrupt descriptor: fail the pages */
			if (pp != NULL)
				pvn_write_done(pp, B_ERROR | B_WRITE | flags);
			if (bp->b_flags & B_ERROR) {
				error = EIO;
			} else {
				error = EINVAL;
			}
			brelse(bp);
			return (error);
		}
		if ((bp->b_error == 0) &&
			(bp->b_resid == 0)) {

			/* copy the page back into the descriptor block */
			caddr = bp->b_un.b_addr + ip->i_data_off;
			kaddr = (caddr_t)ppmapin(pp,
				PROT_READ | PROT_WRITE, (caddr_t)-1);
			(void) kcopy(kaddr, caddr, ip->i_size);
			ppmapout(kaddr);
		}
		/* recompute the descriptor tag over header + data */
		crc_len = offsetof(struct file_entry, fe_spec) +
			SWAP_32(fe->fe_len_ear);
		crc_len += ip->i_size;
		ud_make_tag(ip->i_udf, &fe->fe_tag,
			UD_FILE_ENTRY, ip->i_icb_block, crc_len);

		bwrite(bp);

		if (flags & B_ASYNC) {
			pvn_write_done(pp, flags);
		}
		contig = ip->i_size;
	} else {

		if (error = ud_bmap_read(ip, off, &bn, &contig)) {
			goto out;
		}
		contig = MIN(contig, PAGESIZE);
		contig = (contig + lbmask) & (~lbmask);

		if (contig < PAGESIZE) {
			uint64_t count;

			count = isize - off;
			if (contig != count) {
				/* page spans multiple extents */
				multi_io = 1;
				contig = (int32_t)(MIN(count, PAGESIZE));
			}
		}

		if ((off + contig) > isize) {
			contig = isize - off;
		}

		if (contig > PAGESIZE) {
			if (contig & PAGEOFFSET) {
				contig &= PAGEMASK;
			}
		}

		pp = pvn_write_kluster(vp, pp, &io_off,
			&io_len, off, contig, flags);
		if (io_len == 0) {
			io_len = PAGESIZE;
		}

		bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
		ASSERT(bp != NULL);

		bp->b_edev = ip->i_dev;
		bp->b_dev = cmpdev(ip->i_dev);
		bp->b_blkno = bn;
		bp->b_un.b_addr = 0;
		bp->b_file = vp;
		bp->b_offset = (offset_t)off;


		/*
		 * write throttle
		 * ud_iodone will credit i_writes back on completion.
		 */
		ASSERT(bp->b_iodone == NULL);
		bp->b_iodone = ud_iodone;
		mutex_enter(&ip->i_tlock);
		ip->i_writes += bp->b_bcount;
		mutex_exit(&ip->i_tlock);

		if (multi_io == 0) {

			(void) bdev_strategy(bp);
		} else {
			error = ud_multi_strat(ip, pp, bp, off);
			if (error != 0) {
				goto out;
			}
		}

		if ((flags & B_ASYNC) == 0) {
			/*
			 * Wait for i/o to complete.
			 */
			error = biowait(bp);
			pageio_done(bp);
		}
	}

	if ((flags & B_ASYNC) == 0) {
		pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
	}

	pp = NULL;

out:
	if (error != 0 && pp != NULL) {
		pvn_write_done(pp, B_ERROR | B_WRITE | flags);
	}

	if (offp) {
		*offp = io_off;
	}
	if (lenp) {
		*lenp = io_len;
	}

	return (error);
}
2977
2978
2979 int32_t
2980 ud_iodone(struct buf *bp)
2981 {
2982 struct ud_inode *ip;
2983
2984 ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
2985
2986 bp->b_iodone = NULL;
2987
2988 ip = VTOI(bp->b_pages->p_vnode);
2989
2990 mutex_enter(&ip->i_tlock);
2991 if (ip->i_writes >= ud_LW) {
2992 if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2993 if (ud_WRITES) {
2994 cv_broadcast(&ip->i_wrcv); /* wake all up */
2995 }
2996 }
2997 } else {
2998 ip->i_writes -= bp->b_bcount;
2999 }
3000 mutex_exit(&ip->i_tlock);
3001 iodone(bp);
3002 return (0);
3003 }
3004
/*
 * Read data from the inode into the uio, one logical block at a time,
 * through the segkmap window (segmap_getmapflt/uiomove).  Caller holds
 * i_contents (reader or writer); when held as reader it is dropped
 * around the segmap fault to avoid deadlocking against pagefaults on
 * the same file.  Honours FRSYNC/FSYNC/FDSYNC by pushing pages and
 * updating the inode as described in the table at "out:".  Returns 0
 * or an errno; a partial read that moved any data returns 0.
 */
/* ARGSUSED3 */
int32_t
ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
{
	struct vnode *vp;
	struct udf_vfs *udf_vfsp;
	krw_t rwtype;
	caddr_t base;
	uint32_t flags;
	int32_t error, n, on, mapon, dofree;
	u_offset_t off;
	long oresid = uio->uio_resid;

	ASSERT(RW_LOCK_HELD(&ip->i_contents));
	if ((ip->i_type != VREG) &&
		(ip->i_type != VDIR) &&
		(ip->i_type != VLNK)) {
		return (EIO);
	}

	/* beyond the maximum representable offset: EOF, not an error */
	if (uio->uio_loffset > MAXOFFSET_T) {
		return (0);
	}

	if ((uio->uio_loffset < (offset_t)0) ||
		((uio->uio_loffset + uio->uio_resid) < 0)) {
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		return (0);
	}

	vp = ITOV(ip);
	udf_vfsp = ip->i_udf;
	mutex_enter(&ip->i_tlock);
	ip->i_flag |= IACC;
	mutex_exit(&ip->i_tlock);

	rwtype = (rw_write_held(&ip->i_contents)?RW_WRITER:RW_READER);

	do {
		offset_t diff;
		u_offset_t uoff = uio->uio_loffset;
		/* off: segmap window base; mapon: offset within window */
		off = uoff & (offset_t)MAXBMASK;
		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
		on = (int)blkoff(udf_vfsp, uoff);
		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);

		diff = ip->i_size - uoff;

		if (diff <= (offset_t)0) {
			/* at or past EOF */
			error = 0;
			goto out;
		}
		if (diff < (offset_t)n) {
			n = (int)diff;
		}
		/* free-behind candidate: sequential read of a large file */
		dofree = ud_freebehind &&
			ip->i_nextr == (off & PAGEMASK) &&
			off > ud_smallfile;

#ifndef	__lock_lint
		/* drop the reader lock across the segmap fault */
		if (rwtype == RW_READER) {
			rw_exit(&ip->i_contents);
		}
#endif

		base = segmap_getmapflt(segkmap, vp, (off + mapon),
			(uint32_t)n, 1, S_READ);
		error = uiomove(base + mapon, (long)n, UIO_READ, uio);

		flags = 0;
		if (!error) {
			/*
			 * If read a whole block, or read to eof,
			 * won't need this buffer again soon.
			 */
			if (n + on == MAXBSIZE && ud_freebehind && dofree &&
				freemem < lotsfree + pages_before_pager) {
				flags = SM_FREE | SM_DONTNEED |SM_ASYNC;
			}
			/*
			 * In POSIX SYNC (FSYNC and FDSYNC) read mode,
			 * we want to make sure that the page which has
			 * been read, is written on disk if it is dirty.
			 * And corresponding indirect blocks should also
			 * be flushed out.
			 */
			if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
				flags &= ~SM_ASYNC;
				flags |= SM_WRITE;
			}
			error = segmap_release(segkmap, base, flags);
		} else {
			(void) segmap_release(segkmap, base, flags);
		}

#ifndef	__lock_lint
		if (rwtype == RW_READER) {
			rw_enter(&ip->i_contents, rwtype);
		}
#endif
	} while (error == 0 && uio->uio_resid > 0 && n != 0);
out:
	/*
	 * Inode is updated according to this table if FRSYNC is set.
	 *
	 *	FSYNC	FDSYNC(posix.4)
	 *	--------------------------
	 *	always	IATTCHG|IBDWRITE
	 */
	if (ioflag & FRSYNC) {
		if ((ioflag & FSYNC) ||
			((ioflag & FDSYNC) &&
			(ip->i_flag & (IATTCHG|IBDWRITE)))) {
			rw_exit(&ip->i_contents);
			rw_enter(&ip->i_contents, RW_WRITER);
			ud_iupdat(ip, 1);
		}
	}
	/*
	 * If we've already done a partial read, terminate
	 * the read but return no error.
	 */
	if (oresid != uio->uio_resid) {
		error = 0;
	}
	ITIMES(ip);

	return (error);
}
3136
/*
 * Write data from the uio into the inode, one logical block at a time,
 * through the segkmap window.  Caller holds i_contents as writer; the
 * lock is dropped around the segmap fault/uiomove and re-acquired
 * afterwards.  Handles file extension (allocating blocks with
 * ud_bmap_write and rolling i_size back on failure), RLIMIT_FSIZE
 * enforcement, page-creation optimizations for full-window writes,
 * clearing of setuid/setgid on unprivileged writes, and synchronous
 * semantics for FSYNC/FDSYNC per the table at "out:".  Returns 0 or
 * an errno; a partial write that moved any data returns 0.
 */
int32_t
ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
{
	caddr_t base;
	struct vnode *vp;
	struct udf_vfs *udf_vfsp;
	uint32_t flags;
	int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
	int32_t pagecreate, newpage;
	uint64_t old_i_size;
	u_offset_t off;
	long start_resid = uio->uio_resid, premove_resid;
	rlim64_t limit = uio->uio_limit;


	ASSERT(RW_WRITE_HELD(&ip->i_contents));
	if ((ip->i_type != VREG) &&
		(ip->i_type != VDIR) &&
		(ip->i_type != VLNK)) {
		return (EIO);
	}

	if (uio->uio_loffset >= MAXOFFSET_T) {
		return (EFBIG);
	}
	/*
	 * see udf_l_pathconf
	 * UDF files are limited to 2^40 - 1 bytes.
	 */
	if (limit > (((uint64_t)1 << 40) - 1)) {
		limit = ((uint64_t)1 << 40) - 1;
	}
	if (uio->uio_loffset >= limit) {
		/* over the resource limit: deliver SIGXFSZ via rctl */
		proc_t *p = ttoproc(curthread);

		mutex_enter(&p->p_lock);
		(void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
		    p, RCA_UNSAFE_SIGINFO);
		mutex_exit(&p->p_lock);
		return (EFBIG);
	}
	if ((uio->uio_loffset < (offset_t)0) ||
		((uio->uio_loffset + uio->uio_resid) < 0)) {
		return (EINVAL);
	}
	if (uio->uio_resid == 0) {
		return (0);
	}

	mutex_enter(&ip->i_tlock);
	ip->i_flag |= INOACC;

	/*
	 * NOTE(review): iupdat_flag is only initialized on this path,
	 * but it is only read later under ISYNC (set here) or after
	 * being assigned in the loop, so the use appears safe.
	 */
	if (ioflag & (FSYNC | FDSYNC)) {
		ip->i_flag |= ISYNC;
		iupdat_flag = 1;
	}
	mutex_exit(&ip->i_tlock);

	udf_vfsp = ip->i_udf;
	vp = ITOV(ip);

	do {
		u_offset_t uoff = uio->uio_loffset;
		/* off: segmap window base; mapon: offset within window */
		off = uoff & (offset_t)MAXBMASK;
		mapon = (int)(uoff & (offset_t)MAXBOFFSET);
		on = (int)blkoff(udf_vfsp, uoff);
		n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);

		if (ip->i_type == VREG && uoff + n >= limit) {
			if (uoff >= limit) {
				error = EFBIG;
				goto out;
			}
			/* clip the transfer to the limit */
			n = (int)(limit - (rlim64_t)uoff);
		}
		if (uoff + n > ip->i_size) {
			/*
			 * We are extending the length of the file.
			 * bmap is used so that we are sure that
			 * if we need to allocate new blocks, that it
			 * is done here before we up the file size.
			 */
			error = ud_bmap_write(ip, uoff,
				(int)(on + n), mapon == 0, cr);
			if (error) {
				break;
			}
			i_size_changed = 1;
			old_i_size = ip->i_size;
			ip->i_size = uoff + n;
			/*
			 * If we are writing from the beginning of
			 * the mapping, we can just create the
			 * pages without having to read them.
			 */
			pagecreate = (mapon == 0);
		} else if (n == MAXBSIZE) {
			/*
			 * Going to do a whole mappings worth,
			 * so we can just create the pages w/o
			 * having to read them in.  But before
			 * we do that, we need to make sure any
			 * needed blocks are allocated first.
			 */
			error = ud_bmap_write(ip, uoff,
				(int)(on + n), 1, cr);
			if (error) {
				break;
			}
			pagecreate = 1;
		} else {
			pagecreate = 0;
		}

		rw_exit(&ip->i_contents);

		/*
		 * Touch the page and fault it in if it is not in
		 * core before segmap_getmapflt can lock it. This
		 * is to avoid the deadlock if the buffer is mapped
		 * to the same file through mmap which we want to
		 * write to.
		 */
		uio_prefaultpages((long)n, uio);

		base = segmap_getmapflt(segkmap, vp, (off + mapon),
			(uint32_t)n, !pagecreate, S_WRITE);

		/*
		 * segmap_pagecreate() returns 1 if it calls
		 * page_create_va() to allocate any pages.
		 */
		newpage = 0;
		if (pagecreate) {
			newpage = segmap_pagecreate(segkmap, base,
				(size_t)n, 0);
		}

		premove_resid = uio->uio_resid;
		error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);

		if (pagecreate &&
			uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
			/*
			 * We created pages w/o initializing them completely,
			 * thus we need to zero the part that wasn't set up.
			 * This happens on most EOF write cases and if
			 * we had some sort of error during the uiomove.
			 */
			int nzero, nmoved;

			nmoved = (int)(uio->uio_loffset - (off + mapon));
			ASSERT(nmoved >= 0 && nmoved <= n);
			nzero = roundup(on + n, PAGESIZE) - nmoved;
			ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
			(void) kzero(base + mapon + nmoved, (uint32_t)nzero);
		}

		/*
		 * Unlock the pages allocated by page_create_va()
		 * in segmap_pagecreate()
		 */
		if (newpage) {
			segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
		}

		if (error) {
			/*
			 * If we failed on a write, we may have already
			 * allocated file blocks as well as pages.  It's
			 * hard to undo the block allocation, but we must
			 * be sure to invalidate any pages that may have
			 * been allocated.
			 */
			(void) segmap_release(segkmap, base, SM_INVAL);
		} else {
			flags = 0;
			/*
			 * Force write back for synchronous write cases.
			 */
			if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
				/*
				 * If the sticky bit is set but the
				 * execute bit is not set, we do a
				 * synchronous write back and free
				 * the page when done.  We set up swap
				 * files to be handled this way to
				 * prevent servers from keeping around
				 * the client's swap pages too long.
				 * XXX - there ought to be a better way.
				 */
				if (IS_SWAPVP(vp)) {
					flags = SM_WRITE | SM_FREE |
						SM_DONTNEED;
					iupdat_flag = 0;
				} else {
					flags = SM_WRITE;
				}
			} else if (((mapon + n) == MAXBSIZE) ||
				IS_SWAPVP(vp)) {
				/*
				 * Have written a whole block.
				 * Start an asynchronous write and
				 * mark the buffer to indicate that
				 * it won't be needed again soon.
				 */
				flags = SM_WRITE |SM_ASYNC | SM_DONTNEED;
			}
			error = segmap_release(segkmap, base, flags);

			/*
			 * If the operation failed and is synchronous,
			 * then we need to unwind what uiomove() last
			 * did so we can potentially return an error to
			 * the caller.  If this write operation was
			 * done in two pieces and the first succeeded,
			 * then we won't return an error for the second
			 * piece that failed.  However, we only want to
			 * return a resid value that reflects what was
			 * really done.
			 *
			 * Failures for non-synchronous operations can
			 * be ignored since the page subsystem will
			 * retry the operation until it succeeds or the
			 * file system is unmounted.
			 */
			if (error) {
				if ((ioflag & (FSYNC | FDSYNC)) ||
					ip->i_type == VDIR) {
					uio->uio_resid = premove_resid;
				} else {
					error = 0;
				}
			}
		}

		/*
		 * Re-acquire contents lock.
		 */
		rw_enter(&ip->i_contents, RW_WRITER);
		/*
		 * If the uiomove() failed or if a synchronous
		 * page push failed, fix up i_size.
		 */
		if (error) {
			if (i_size_changed) {
				/*
				 * The uiomove failed, and we
				 * allocated blocks,so get rid
				 * of them.
				 */
				(void) ud_itrunc(ip, old_i_size, 0, cr);
			}
		} else {
			/*
			 * XXX - Can this be out of the loop?
			 */
			ip->i_flag |= IUPD | ICHG;
			if (i_size_changed) {
				ip->i_flag |= IATTCHG;
			}
			if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
				(IEXEC >> 10))) != 0 &&
			    (ip->i_char & (ISUID | ISGID)) != 0 &&
			    secpolicy_vnode_setid_retain(cr,
			    (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
				/*
				 * Clear Set-UID & Set-GID bits on
				 * successful write if not privileged
				 * and at least one of the execute bits
				 * is set.  If we always clear Set-GID,
				 * mandatory file and record locking is
				 * unuseable.
				 */
				ip->i_char &= ~(ISUID | ISGID);
			}
		}
	} while (error == 0 && uio->uio_resid > 0 && n != 0);

out:
	/*
	 * Inode is updated according to this table -
	 *
	 *	FSYNC	FDSYNC(posix.4)
	 *	--------------------------
	 *	always@	IATTCHG|IBDWRITE
	 *
	 * @ -	If we are doing synchronous write the only time we should
	 *	not be sync'ing the ip here is if we have the stickyhack
	 *	activated, the file is marked with the sticky bit and
	 *	no exec bit, the file length has not been changed and
	 *	no new blocks have been allocated during this write.
	 */
	if ((ip->i_flag & ISYNC) != 0) {
		/*
		 * we have eliminated nosync
		 */
		if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
			((ioflag & FSYNC) && iupdat_flag)) {
			ud_iupdat(ip, 1);
		}
	}

	/*
	 * If we've already done a partial-write, terminate
	 * the write but return no error.
	 */
	if (start_resid != uio->uio_resid) {
		error = 0;
	}
	ip->i_flag &= ~(INOACC | ISYNC);
	ITIMES_NOLOCK(ip);

	return (error);
}
3451
/*
 * Perform the I/O described by "bp" in pieces when the page(s) span
 * several discontiguous extents on disk.  The extent list for
 * [start, start + bp->b_bcount) is walked twice: first to count the
 * non-hole extents (holes on reads are simply zeroed in the page),
 * then to clone one slave buf per extent with bioclone().  A master
 * record (mio_master_t) tracks the outstanding byte count; the last
 * slave to complete (in ud_slave_done) finishes the original buf and
 * frees the master.  Returns 0 once all slave I/Os are issued, or an
 * errno (with B_ERROR set on bp) if extent lookup fails.
 */
int32_t
ud_multi_strat(struct ud_inode *ip,
	page_t *pp, struct buf *bp, u_offset_t start)
{
	daddr_t bn;
	int32_t error = 0, io_count, contig, alloc_sz, i;
	uint32_t io_off;
	mio_master_t *mm = NULL;
	mio_slave_t *ms = NULL;
	struct buf *rbp;

	ASSERT(!(start & PAGEOFFSET));

	/*
	 * Figure out how many buffers to allocate
	 */
	io_count = 0;
	for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
		contig = 0;
		if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
			&bn, &contig)) {
			goto end;
		}
		if (contig == 0) {
			goto end;
		}
		contig = MIN(contig, PAGESIZE - io_off);
		if (bn != UDF_HOLE) {
			io_count ++;
		} else {
			/*
			 * HOLE
			 */
			if (bp->b_flags & B_READ) {

				/*
				 * This is a hole and is read
				 * it should be filled with 0's
				 */
				pagezero(pp, io_off, contig);
			}
		}
	}


	if (io_count != 0) {

		/*
		 * Allocate memory for all the
		 * required number of buffers
		 */
		alloc_sz = sizeof (mio_master_t) +
			(sizeof (mio_slave_t) * io_count);
		mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
		/*
		 * NOTE(review): kmem_zalloc with KM_SLEEP does not
		 * return NULL, so this check looks like dead code;
		 * kept for safety.
		 */
		if (mm == NULL) {
			error = ENOMEM;
			goto end;
		}

		/*
		 * initialize master
		 */
		mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
		mm->mm_size = alloc_sz;
		mm->mm_bp = bp;
		mm->mm_resid = 0;
		mm->mm_error = 0;
		mm->mm_index = master_index++;

		/* slave array immediately follows the master record */
		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));

		/*
		 * Initialize buffers
		 */
		io_count = 0;
		for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
			contig = 0;
			if (error = ud_bmap_read(ip,
				(u_offset_t)(start + io_off),
				&bn, &contig)) {
				goto end;
			}
			ASSERT(contig);
			if ((io_off + contig) > bp->b_bcount) {
				contig = bp->b_bcount - io_off;
			}
			if (bn != UDF_HOLE) {
				/*
				 * Clone the buffer
				 * and prepare to start I/O
				 */
				ms->ms_ptr = mm;
				bioinit(&ms->ms_buf);
				rbp = bioclone(bp, io_off, (size_t)contig,
					bp->b_edev, bn, ud_slave_done,
					&ms->ms_buf, KM_NOSLEEP);
				ASSERT(rbp == &ms->ms_buf);
				mm->mm_resid += contig;
				io_count++;
				ms ++;
			}
		}

		/*
		 * Start I/O's
		 */
		ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
		for (i = 0; i < io_count; i++) {
			(void) bdev_strategy(&ms->ms_buf);
			ms ++;
		}
	}

end:
	if (error != 0) {
		/* fail the original buf; free master state if allocated */
		bp->b_flags |= B_ERROR;
		bp->b_error = error;
		if (mm != NULL) {
			mutex_destroy(&mm->mm_mutex);
			kmem_free(mm, mm->mm_size);
		}
	}
	return (error);
}
3576
/*
 * Completion routine for the slave bufs cloned in ud_multi_strat().
 * Folds the slave's error and byte count into the master record; the
 * slave that brings the outstanding byte count (mm_resid) to zero is
 * the last one, and it propagates any recorded error to the original
 * buf, calls biodone() on it, and frees the master structure.
 */
int32_t
ud_slave_done(struct buf *bp)
{
	mio_master_t *mm;
	int32_t resid;

	ASSERT(SEMA_HELD(&bp->b_sem));
	ASSERT((bp->b_flags & B_DONE) == 0);

	/* the slave buf is the first member of mio_slave_t */
	mm = ((mio_slave_t *)bp)->ms_ptr;

	/*
	 * Propagate error and byte count info from slave struct to
	 * the master struct
	 */
	mutex_enter(&mm->mm_mutex);
	if (bp->b_flags & B_ERROR) {

		/*
		 * If multiple slave buffers get
		 * error we forget the old errors
		 * this is ok because we any way
		 * cannot return multiple errors
		 */
		mm->mm_error = bp->b_error;
	}
	mm->mm_resid -= bp->b_bcount;
	resid = mm->mm_resid;
	mutex_exit(&mm->mm_mutex);

	/*
	 * free up the resources allocated to cloned buffers.
	 */
	bp_mapout(bp);
	biofini(bp);

	if (resid == 0) {

		/*
		 * This is the last I/O operation
		 * clean up and return the original buffer
		 * (safe to touch mm without the mutex: no other
		 * slave can still reference it once resid hit 0)
		 */
		if (mm->mm_error) {
			mm->mm_bp->b_flags |= B_ERROR;
			mm->mm_bp->b_error = mm->mm_error;
		}
		biodone(mm->mm_bp);
		mutex_destroy(&mm->mm_mutex);
		kmem_free(mm, mm->mm_size);
	}
	return (0);
}