Print this page
OS-5483 iostat -x shows around 100% utilization for idle zone
Reviewed by: Joshua M. Clulow <jmc@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Approved by: Robert Mustacchi <rm@joyent.com>
OS-5148 ftruncate at offset should emit proper events
Reviewed by: Bryan Cantrill <bryan@joyent.com>
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
OS-338 Kstat counters to show "slow" VFS operations
OS-3294 add support for inotify
Reviewed by: Jerry Jelinek <jerry.jelinek@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>


   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 #include <sys/types.h>
  41 #include <sys/param.h>
  42 #include <sys/t_lock.h>
  43 #include <sys/errno.h>
  44 #include <sys/cred.h>


 183                 __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
 184                 (*stataddr)++;                                          \
 185                 vsp->bytecounter.value.ui64 += bytesval;             \
 186                 if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
 187                         vsp->n##counter.value.ui64++;                        \
 188                         vsp->bytecounter.value.ui64 += bytesval;     \
 189                 }                                                       \
 190         }                                                               \
 191 }
 192 
 193 /*
 194  * If the filesystem does not support XIDs map credential
 195  * If the vfsp is NULL, perhaps we should also map?
 196  */
 197 #define VOPXID_MAP_CR(vp, cr)   {                                       \
 198         vfs_t *vfsp = (vp)->v_vfsp;                                  \
 199         if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)             \
 200                 cr = crgetmapped(cr);                                   \
 201         }
 202 




 203 /*
 204  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 205  * numerical order of S_IFMT and vnode types.)
 206  */
 207 enum vtype iftovt_tab[] = {
 208         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 209         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 210 };
 211 
 212 ushort_t vttoif_tab[] = {
 213         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 214         S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 215 };
 216 
 217 /*
 218  * The system vnode cache.
      *
      * Global (non-static) pointer to a kmem cache.
      * NOTE(review): only the declaration is visible here; presumably vnode_t
      * objects are allocated from this cache -- confirm where it is created.
 219  */
 220 
 221 kmem_cache_t *vn_cache;
 222 


2499         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2500 }
2501 
2502 /* Vnode event notification */
2503 
2504 int
2505 vnevent_support(vnode_t *vp, caller_context_t *ct)
2506 {
2507         if (vp == NULL)
2508                 return (EINVAL);
2509 
2510         return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2511 }
2512 
2513 void
2514 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2515 {
2516         if (vp == NULL || vp->v_femhead == NULL) {
2517                 return;
2518         }

2519         (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2520 }
2521 
2522 void
2523 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2524     caller_context_t *ct)
2525 {
2526         if (vp == NULL || vp->v_femhead == NULL) {
2527                 return;
2528         }
2529         (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2530 }
2531 
2532 void
2533 vnevent_rename_dest_dir(vnode_t *vp, caller_context_t *ct)

2534 {
2535         if (vp == NULL || vp->v_femhead == NULL) {
2536                 return;
2537         }
2538         (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, NULL, NULL, ct);
2539 }
2540 
2541 void
2542 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2543 {
2544         if (vp == NULL || vp->v_femhead == NULL) {
2545                 return;
2546         }
2547         (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2548 }
2549 
2550 void
2551 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2552 {
2553         if (vp == NULL || vp->v_femhead == NULL) {
2554                 return;
2555         }
2556         (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2557 }
2558 


2605 }
2606 
2607 void
2608 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2609 {
2610         if (vp == NULL || vp->v_femhead == NULL) {
2611                 return;
2612         }
2613         (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2614 }
2615 
2616 void
2617 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2618 {
2619         if (vp == NULL || vp->v_femhead == NULL) {
2620                 return;
2621         }
2622         (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2623 }
2624 









2625 /*
2626  * Vnode accessors.
2627  */
2628 
2629 int
2630 vn_is_readonly(vnode_t *vp)
2631 {
2632         return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2633 }
2634 
2635 int
2636 vn_has_flocks(vnode_t *vp)
2637 {
2638         return (vp->v_filocks != NULL);
2639 }
2640 
2641 int
2642 vn_has_mandatory_locks(vnode_t *vp, int mode)
2643 {
2644         return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));


3244                 if (flag & FREAD) {
3245                         ASSERT(vp->v_rdcnt > 0);
3246                         atomic_dec_32(&vp->v_rdcnt);
3247                 }
3248                 if (flag & FWRITE) {
3249                         ASSERT(vp->v_wrcnt > 0);
3250                         atomic_dec_32(&vp->v_wrcnt);
3251                 }
3252         }
3253         return (err);
3254 }
3255 
3256 int
3257 fop_read(
3258         vnode_t *vp,
3259         uio_t *uiop,
3260         int ioflag,
3261         cred_t *cr,
3262         caller_context_t *ct)
3263 {
3264         int     err;
3265         ssize_t resid_start = uiop->uio_resid;


3266 













3267         VOPXID_MAP_CR(vp, cr);
3268 
3269         err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3270         VOPSTATS_UPDATE_IO(vp, read,
3271             read_bytes, (resid_start - uiop->uio_resid));

























3272         return (err);
3273 }
3274 
3275 int
3276 fop_write(
3277         vnode_t *vp,
3278         uio_t *uiop,
3279         int ioflag,
3280         cred_t *cr,
3281         caller_context_t *ct)
3282 {
3283         int     err;
3284         ssize_t resid_start = uiop->uio_resid;


3285 


















3286         VOPXID_MAP_CR(vp, cr);
3287 
3288         err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3289         VOPSTATS_UPDATE_IO(vp, write,
3290             write_bytes, (resid_start - uiop->uio_resid));

























3291         return (err);
3292 }
3293 
3294 int
3295 fop_ioctl(
3296         vnode_t *vp,
3297         int cmd,
3298         intptr_t arg,
3299         int flag,
3300         cred_t *cr,
3301         int *rvalp,
3302         caller_context_t *ct)
3303 {
3304         int     err;
3305 
3306         VOPXID_MAP_CR(vp, cr);
3307 
3308         err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3309         VOPSTATS_UPDATE(vp, ioctl);
3310         return (err);




   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved.
  24  * Copyright (c) 2014, Joyent, Inc. All rights reserved.
  25  */
  26 
  27 /*      Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
  28 /*        All Rights Reserved   */
  29 
  30 /*
  31  * University Copyright- Copyright (c) 1982, 1986, 1988
  32  * The Regents of the University of California
  33  * All Rights Reserved
  34  *
  35  * University Acknowledgment- Portions of this document are derived from
  36  * software developed by the University of California, Berkeley, and its
  37  * contributors.
  38  */
  39 
  40 #include <sys/types.h>
  41 #include <sys/param.h>
  42 #include <sys/t_lock.h>
  43 #include <sys/errno.h>
  44 #include <sys/cred.h>


 183                 __dtrace_probe___fsinfo_##counter(vp, bytesval, stataddr); \
 184                 (*stataddr)++;                                          \
 185                 vsp->bytecounter.value.ui64 += bytesval;             \
 186                 if ((vsp = vfsp->vfs_fstypevsp) != NULL) {           \
 187                         vsp->n##counter.value.ui64++;                        \
 188                         vsp->bytecounter.value.ui64 += bytesval;     \
 189                 }                                                       \
 190         }                                                               \
 191 }
 192 
 193 /*
 194  * If the filesystem does not support XIDs map credential
 195  * If the vfsp is NULL, perhaps we should also map?
 196  */
 197 #define VOPXID_MAP_CR(vp, cr)   {                                       \
 198         vfs_t *vfsp = (vp)->v_vfsp;                                  \
 199         if (vfsp != NULL && (vfsp->vfs_flag & VFS_XID) == 0)             \
 200                 cr = crgetmapped(cr);                                   \
 201         }
 202 
 203 #define VOP_LATENCY_10MS        10000000
 204 #define VOP_LATENCY_100MS       100000000
 205 #define VOP_LATENCY_1S          1000000000
 206 
 207 /*
 208  * Convert stat(2) formats to vnode types and vice versa.  (Knows about
 209  * numerical order of S_IFMT and vnode types.)
 210  */
 211 enum vtype iftovt_tab[] = {
 212         VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
 213         VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VNON
 214 };
 215 
 216 ushort_t vttoif_tab[] = {
 217         0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFIFO,
 218         S_IFDOOR, 0, S_IFSOCK, S_IFPORT, 0
 219 };
 220 
 221 /*
 222  * The system vnode cache.
      *
      * Global (non-static) pointer to a kmem cache.
      * NOTE(review): only the declaration is visible here; presumably vnode_t
      * objects are allocated from this cache -- confirm where it is created.
 223  */
 224 
 225 kmem_cache_t *vn_cache;
 226 


2503         (void) VFS_VNSTATE(vfsp, vp, VNTRANS_DESTROYED);
2504 }
2505 
2506 /* Vnode event notification */
2507 
2508 int
2509 vnevent_support(vnode_t *vp, caller_context_t *ct)
2510 {
2511         if (vp == NULL)
2512                 return (EINVAL);
2513 
2514         return (VOP_VNEVENT(vp, VE_SUPPORT, NULL, NULL, ct));
2515 }
2516 
2517 void
2518 vnevent_rename_src(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2519 {
2520         if (vp == NULL || vp->v_femhead == NULL) {
2521                 return;
2522         }
2523         (void) VOP_VNEVENT(dvp, VE_RENAME_SRC_DIR, vp, name, ct);
2524         (void) VOP_VNEVENT(vp, VE_RENAME_SRC, dvp, name, ct);
2525 }
2526 
2527 void
2528 vnevent_rename_dest(vnode_t *vp, vnode_t *dvp, char *name,
2529     caller_context_t *ct)
2530 {
2531         if (vp == NULL || vp->v_femhead == NULL) {
2532                 return;
2533         }
2534         (void) VOP_VNEVENT(vp, VE_RENAME_DEST, dvp, name, ct);
2535 }
2536 
2537 void
2538 vnevent_rename_dest_dir(vnode_t *vp, vnode_t *nvp, char *name,
2539     caller_context_t *ct)
2540 {
2541         if (vp == NULL || vp->v_femhead == NULL) {
2542                 return;
2543         }
2544         (void) VOP_VNEVENT(vp, VE_RENAME_DEST_DIR, nvp, name, ct);
2545 }
2546 
2547 void
2548 vnevent_remove(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2549 {
2550         if (vp == NULL || vp->v_femhead == NULL) {
2551                 return;
2552         }
2553         (void) VOP_VNEVENT(vp, VE_REMOVE, dvp, name, ct);
2554 }
2555 
2556 void
2557 vnevent_rmdir(vnode_t *vp, vnode_t *dvp, char *name, caller_context_t *ct)
2558 {
2559         if (vp == NULL || vp->v_femhead == NULL) {
2560                 return;
2561         }
2562         (void) VOP_VNEVENT(vp, VE_RMDIR, dvp, name, ct);
2563 }
2564 


2611 }
2612 
2613 void
2614 vnevent_mountedover(vnode_t *vp, caller_context_t *ct)
2615 {
2616         if (vp == NULL || vp->v_femhead == NULL) {
2617                 return;
2618         }
2619         (void) VOP_VNEVENT(vp, VE_MOUNTEDOVER, NULL, NULL, ct);
2620 }
2621 
2622 void
2623 vnevent_truncate(vnode_t *vp, caller_context_t *ct)
2624 {
2625         if (vp == NULL || vp->v_femhead == NULL) {
2626                 return;
2627         }
2628         (void) VOP_VNEVENT(vp, VE_TRUNCATE, NULL, NULL, ct);
2629 }
2630 
2631 void
2632 vnevent_resize(vnode_t *vp, caller_context_t *ct)
2633 {
2634         if (vp == NULL || vp->v_femhead == NULL) {
2635                 return;
2636         }
2637         (void) VOP_VNEVENT(vp, VE_RESIZE, NULL, NULL, ct);
2638 }
2639 
2640 /*
2641  * Vnode accessors.
2642  */
2643 
2644 int
2645 vn_is_readonly(vnode_t *vp)
2646 {
2647         return (vp->v_vfsp->vfs_flag & VFS_RDONLY);
2648 }
2649 
2650 int
2651 vn_has_flocks(vnode_t *vp)
2652 {
2653         return (vp->v_filocks != NULL);
2654 }
2655 
2656 int
2657 vn_has_mandatory_locks(vnode_t *vp, int mode)
2658 {
2659         return ((vp->v_filocks != NULL) && (MANDLOCK(vp, mode)));


3259                 if (flag & FREAD) {
3260                         ASSERT(vp->v_rdcnt > 0);
3261                         atomic_dec_32(&vp->v_rdcnt);
3262                 }
3263                 if (flag & FWRITE) {
3264                         ASSERT(vp->v_wrcnt > 0);
3265                         atomic_dec_32(&vp->v_wrcnt);
3266                 }
3267         }
3268         return (err);
3269 }
3270 
3271 int
3272 fop_read(
3273         vnode_t *vp,
3274         uio_t *uiop,
3275         int ioflag,
3276         cred_t *cr,
3277         caller_context_t *ct)
3278 {

3279         ssize_t resid_start = uiop->uio_resid;
3280         zone_t  *zonep = curzone;
3281         zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3282 
3283         hrtime_t start = 0, lat;
3284         ssize_t len;
3285         int err;
3286 
3287         if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3288             vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3289                 start = gethrtime();
3290 
3291                 mutex_enter(&zonep->zone_vfs_lock);
3292                 kstat_runq_enter(&zonep->zone_vfs_rwstats);
3293                 mutex_exit(&zonep->zone_vfs_lock);
3294         }
3295 
3296         VOPXID_MAP_CR(vp, cr);
3297 
3298         err = (*(vp)->v_op->vop_read)(vp, uiop, ioflag, cr, ct);
3299         len = resid_start - uiop->uio_resid;
3300 
3301         VOPSTATS_UPDATE_IO(vp, read, read_bytes, len);
3302 
3303         if (start != 0) {
3304                 mutex_enter(&zonep->zone_vfs_lock);
3305                 zonep->zone_vfs_rwstats.reads++;
3306                 zonep->zone_vfs_rwstats.nread += len;
3307                 kstat_runq_exit(&zonep->zone_vfs_rwstats);
3308                 mutex_exit(&zonep->zone_vfs_lock);
3309 
3310                 lat = gethrtime() - start;
3311 
3312                 if (lat >= VOP_LATENCY_10MS) {
3313                         if (lat < VOP_LATENCY_100MS)
3314                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3315                         else if (lat < VOP_LATENCY_1S) {
3316                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3317                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3318                         } else {
3319                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3320                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3321                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3322                         }
3323                 }
3324         }
3325 
3326         return (err);
3327 }
3328 
3329 int
3330 fop_write(
3331         vnode_t *vp,
3332         uio_t *uiop,
3333         int ioflag,
3334         cred_t *cr,
3335         caller_context_t *ct)
3336 {

3337         ssize_t resid_start = uiop->uio_resid;
3338         zone_t  *zonep = curzone;
3339         zone_vfs_kstat_t *zvp = zonep->zone_vfs_stats;
3340 
3341         hrtime_t start = 0, lat;
3342         ssize_t len;
3343         int     err;
3344 
3345         /*
3346          * For the purposes of VFS kstat consumers, the "waitq" calculation is
3347          * repurposed as the active queue for VFS write operations.  There's no
3348          * actual wait queue for VFS operations.
3349          */
3350         if ((vp->v_type == VREG || vp->v_type == VDIR || vp->v_type == VBLK) &&
3351             vp->v_vfsp != NULL && (vp->v_vfsp->vfs_flag & VFS_STATS)) {
3352                 start = gethrtime();
3353 
3354                 mutex_enter(&zonep->zone_vfs_lock);
3355                 kstat_waitq_enter(&zonep->zone_vfs_rwstats);
3356                 mutex_exit(&zonep->zone_vfs_lock);
3357         }
3358 
3359         VOPXID_MAP_CR(vp, cr);
3360 
3361         err = (*(vp)->v_op->vop_write)(vp, uiop, ioflag, cr, ct);
3362         len = resid_start - uiop->uio_resid;
3363 
3364         VOPSTATS_UPDATE_IO(vp, write, write_bytes, len);
3365 
3366         if (start != 0) {
3367                 mutex_enter(&zonep->zone_vfs_lock);
3368                 zonep->zone_vfs_rwstats.writes++;
3369                 zonep->zone_vfs_rwstats.nwritten += len;
3370                 kstat_waitq_exit(&zonep->zone_vfs_rwstats);
3371                 mutex_exit(&zonep->zone_vfs_lock);
3372 
3373                 lat = gethrtime() - start;
3374 
3375                 if (lat >= VOP_LATENCY_10MS) {
3376                         if (lat < VOP_LATENCY_100MS)
3377                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3378                         else if (lat < VOP_LATENCY_1S) {
3379                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3380                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3381                         } else {
3382                                 atomic_inc_64(&zvp->zv_10ms_ops.value.ui64);
3383                                 atomic_inc_64(&zvp->zv_100ms_ops.value.ui64);
3384                                 atomic_inc_64(&zvp->zv_1s_ops.value.ui64);
3385                         }
3386                 }
3387         }
3388 
3389         return (err);
3390 }
3391 
3392 int
3393 fop_ioctl(
3394         vnode_t *vp,
3395         int cmd,
3396         intptr_t arg,
3397         int flag,
3398         cred_t *cr,
3399         int *rvalp,
3400         caller_context_t *ct)
3401 {
3402         int     err;
3403 
3404         VOPXID_MAP_CR(vp, cr);
3405 
3406         err = (*(vp)->v_op->vop_ioctl)(vp, cmd, arg, flag, cr, rvalp, ct);
3407         VOPSTATS_UPDATE(vp, ioctl);
3408         return (err);