Print this page
OS-4319 zfs mishandles partial writes


 648  * Timestamps:
 649  *      vp - ctime|mtime updated if byte count > 0
 650  */
 651 
 652 /* ARGSUSED */
 653 static int
 654 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 655 {
 656         znode_t         *zp = VTOZ(vp);
 657         rlim64_t        limit = uio->uio_llimit;
 658         ssize_t         start_resid = uio->uio_resid;
 659         ssize_t         tx_bytes;
 660         uint64_t        end_size;
 661         dmu_tx_t        *tx;
 662         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 663         zilog_t         *zilog;
 664         offset_t        woff;
 665         ssize_t         n, nbytes;
 666         int             max_blksz = zfsvfs->z_max_blksz;
 667         int             error = 0;

 668         arc_buf_t       *abuf;
 669         iovec_t         *aiov = NULL;
 670         xuio_t          *xuio = NULL;
 671         int             i_iov = 0;
 672         int             iovcnt = uio->uio_iovcnt;
 673         iovec_t         *iovp = uio->uio_iov;
 674         int             write_eof;
 675         int             count = 0;
 676         sa_bulk_attr_t  bulk[4];
 677         uint64_t        mtime[2], ctime[2];
 678 
 679         /*
 680          * Fasttrack empty write
 681          */
 682         n = start_resid;
 683         if (n == 0)
 684                 return (0);
 685 
 686         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 687                 limit = MAXOFFSET_T;


 955                     secpolicy_vnode_setid_retain(cr,
 956                     (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
 957                         uint64_t newmode;
 958                         zp->z_mode &= ~(S_ISUID | S_ISGID);
 959                         newmode = zp->z_mode;
 960                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 961                             (void *)&newmode, sizeof (uint64_t), tx);
 962                 }
 963                 mutex_exit(&zp->z_acl_lock);
 964 
 965                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 966                     B_TRUE);
 967 
 968                 /*
 969                  * Update the file size (z_size) if it has changed;
 970                  * account for possible concurrent updates.
 971                  */
 972                 while ((end_size = zp->z_size) < uio->uio_loffset) {
 973                         (void) atomic_cas_64(&zp->z_size, end_size,
 974                             uio->uio_loffset);
 975                         ASSERT(error == 0);
 976                 }
 977                 /*
 978                  * If we are replaying and eof is non zero then force
 979                  * the file size to the specified eof. Note, there's no
 980                  * concurrency during replay.
 981                  */
 982                 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 983                         zp->z_size = zfsvfs->z_replay_eof;
 984 





 985                 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 986 
 987                 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 988                 dmu_tx_commit(tx);
 989 
 990                 if (error != 0)
 991                         break;
 992                 ASSERT(tx_bytes == nbytes);
 993                 n -= nbytes;
 994 
 995                 if (!xuio && n > 0)
 996                         uio_prefaultpages(MIN(n, max_blksz), uio);
 997         }
 998 
 999         rangelock_exit(lr);
1000 
1001         /*
1002          * If we're in replay mode, or we made no progress, return error.
1003          * Otherwise, it's at least a partial write, so it's successful.
1004          */
1005         if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1006                 ZFS_EXIT(zfsvfs);
1007                 return (error);
1008         }
1009 
1010         if (ioflag & (FSYNC | FDSYNC) ||




 648  * Timestamps:
 649  *      vp - ctime|mtime updated if byte count > 0
 650  */
 651 
 652 /* ARGSUSED */
 653 static int
 654 zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct)
 655 {
 656         znode_t         *zp = VTOZ(vp);
 657         rlim64_t        limit = uio->uio_llimit;
 658         ssize_t         start_resid = uio->uio_resid;
 659         ssize_t         tx_bytes;
 660         uint64_t        end_size;
 661         dmu_tx_t        *tx;
 662         zfsvfs_t        *zfsvfs = zp->z_zfsvfs;
 663         zilog_t         *zilog;
 664         offset_t        woff;
 665         ssize_t         n, nbytes;
 666         int             max_blksz = zfsvfs->z_max_blksz;
 667         int             error = 0;
 668         int             prev_error;
 669         arc_buf_t       *abuf;
 670         iovec_t         *aiov = NULL;
 671         xuio_t          *xuio = NULL;
 672         int             i_iov = 0;
 673         int             iovcnt = uio->uio_iovcnt;
 674         iovec_t         *iovp = uio->uio_iov;
 675         int             write_eof;
 676         int             count = 0;
 677         sa_bulk_attr_t  bulk[4];
 678         uint64_t        mtime[2], ctime[2];
 679 
 680         /*
 681          * Fasttrack empty write
 682          */
 683         n = start_resid;
 684         if (n == 0)
 685                 return (0);
 686 
 687         if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T)
 688                 limit = MAXOFFSET_T;


 956                     secpolicy_vnode_setid_retain(cr,
 957                     (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
 958                         uint64_t newmode;
 959                         zp->z_mode &= ~(S_ISUID | S_ISGID);
 960                         newmode = zp->z_mode;
 961                         (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
 962                             (void *)&newmode, sizeof (uint64_t), tx);
 963                 }
 964                 mutex_exit(&zp->z_acl_lock);
 965 
 966                 zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime,
 967                     B_TRUE);
 968 
 969                 /*
 970                  * Update the file size (z_size) if it has changed;
 971                  * account for possible concurrent updates.
 972                  */
 973                 while ((end_size = zp->z_size) < uio->uio_loffset) {
 974                         (void) atomic_cas_64(&zp->z_size, end_size,
 975                             uio->uio_loffset);

 976                 }
 977                 /*
 978                  * If we are replaying and eof is non zero then force
 979                  * the file size to the specified eof. Note, there's no
 980                  * concurrency during replay.
 981                  */
 982                 if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0)
 983                         zp->z_size = zfsvfs->z_replay_eof;
 984 
 985                 /*
 986                  * Keep track of a possible pre-existing error from a partial
 987                  * write via dmu_write_uio_dbuf above.
 988                  */
 989                 prev_error = error;
 990                 error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 991 
 992                 zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag);
 993                 dmu_tx_commit(tx);
 994 
 995                 if (prev_error != 0 || error != 0)
 996                         break;
 997                 ASSERT(tx_bytes == nbytes);
 998                 n -= nbytes;
 999 
1000                 if (!xuio && n > 0)
1001                         uio_prefaultpages(MIN(n, max_blksz), uio);
1002         }
1003 
1004         rangelock_exit(lr);
1005 
1006         /*
1007          * If we're in replay mode, or we made no progress, return error.
1008          * Otherwise, it's at least a partial write, so it's successful.
1009          */
1010         if (zfsvfs->z_replay || uio->uio_resid == start_resid) {
1011                 ZFS_EXIT(zfsvfs);
1012                 return (error);
1013         }
1014 
1015         if (ioflag & (FSYNC | FDSYNC) ||