          --- old/usr/src/uts/common/fs/udfs/udf_vnops.c
          +++ new/usr/src/uts/common/fs/udfs/udf_vnops.c
   1    1  /*
   2    2   * CDDL HEADER START
   3    3   *
   4    4   * The contents of this file are subject to the terms of the
   5    5   * Common Development and Distribution License (the "License").
   6    6   * You may not use this file except in compliance with the License.
   7    7   *
   8    8   * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9    9   * or http://www.opensolaris.org/os/licensing.
  10   10   * See the License for the specific language governing permissions
  11   11   * and limitations under the License.
  12   12   *
  13   13   * When distributing Covered Code, include this CDDL HEADER in each
  14   14   * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15   15   * If applicable, add the following below this CDDL HEADER, with the
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  
  22   22  /*
  23   23   * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved.
  24   24   */
  25   25  
  26   26  /*
  27   27   * Copyright 2015, Joyent, Inc.
  28   28   */
  29   29  
  30   30  #include <sys/types.h>
  31   31  #include <sys/t_lock.h>
  32   32  #include <sys/param.h>
  33   33  #include <sys/time.h>
  34   34  #include <sys/systm.h>
  35   35  #include <sys/sysmacros.h>
  36   36  #include <sys/resource.h>
  37   37  #include <sys/signal.h>
  38   38  #include <sys/cred.h>
  39   39  #include <sys/user.h>
  40   40  #include <sys/buf.h>
  41   41  #include <sys/vfs.h>
  42   42  #include <sys/vfs_opreg.h>
  43   43  #include <sys/stat.h>
  44   44  #include <sys/vnode.h>
  45   45  #include <sys/mode.h>
  46   46  #include <sys/proc.h>
  47   47  #include <sys/disp.h>
  48   48  #include <sys/file.h>
  49   49  #include <sys/fcntl.h>
  50   50  #include <sys/flock.h>
  51   51  #include <sys/kmem.h>
  52   52  #include <sys/uio.h>
  53   53  #include <sys/dnlc.h>
  54   54  #include <sys/conf.h>
  55   55  #include <sys/errno.h>
  56   56  #include <sys/mman.h>
  57   57  #include <sys/fbuf.h>
  58   58  #include <sys/pathname.h>
  59   59  #include <sys/debug.h>
  60   60  #include <sys/vmsystm.h>
  61   61  #include <sys/cmn_err.h>
  62   62  #include <sys/dirent.h>
  64   64  #include <sys/modctl.h>
  65   65  #include <sys/statvfs.h>
  66   66  #include <sys/mount.h>
  67   67  #include <sys/sunddi.h>
  68   68  #include <sys/bootconf.h>
  69   69  #include <sys/policy.h>
  70   70  
  71   71  #include <vm/hat.h>
  72   72  #include <vm/page.h>
  73   73  #include <vm/pvn.h>
  74   74  #include <vm/as.h>
  75   75  #include <vm/seg.h>
  76   76  #include <vm/seg_map.h>
  77   77  #include <vm/seg_kmem.h>
  78   78  #include <vm/seg_vn.h>
  79   79  #include <vm/rm.h>
  81   81  #include <sys/swap.h>
  82   82  
  83   83  #include <fs/fs_subr.h>
  84   84  
  85   85  #include <sys/fs/udf_volume.h>
  86   86  #include <sys/fs/udf_inode.h>
  87   87  
  88   88  static int32_t udf_open(struct vnode **,
  89   89          int32_t, struct cred *, caller_context_t *);
  90   90  static int32_t udf_close(struct vnode *,
  91   91          int32_t, int32_t, offset_t, struct cred *, caller_context_t *);
  92   92  static int32_t udf_read(struct vnode *,
  93   93          struct uio *, int32_t, struct cred *, caller_context_t *);
  94   94  static int32_t udf_write(struct vnode *,
  95   95          struct uio *, int32_t, struct cred *, caller_context_t *);
  96   96  static int32_t udf_ioctl(struct vnode *,
  97   97          int32_t, intptr_t, int32_t, struct cred *, int32_t *,
  98   98          caller_context_t *);
  99   99  static int32_t udf_getattr(struct vnode *,
 100  100          struct vattr *, int32_t, struct cred *, caller_context_t *);
 101  101  static int32_t udf_setattr(struct vnode *,
 102  102          struct vattr *, int32_t, struct cred *, caller_context_t *);
 103  103  static int32_t udf_access(struct vnode *,
 104  104          int32_t, int32_t, struct cred *, caller_context_t *);
 105  105  static int32_t udf_lookup(struct vnode *,
 106  106          char *, struct vnode **, struct pathname *,
 107  107          int32_t, struct vnode *, struct cred *,
 108  108          caller_context_t *, int *, pathname_t *);
 109  109  static int32_t udf_create(struct vnode *,
 110  110          char *, struct vattr *, enum vcexcl,
 111  111          int32_t, struct vnode **, struct cred *, int32_t,
 112  112          caller_context_t *, vsecattr_t *);
 113  113  static int32_t udf_remove(struct vnode *,
 114  114          char *, struct cred *, caller_context_t *, int);
 115  115  static int32_t udf_link(struct vnode *,
 116  116          struct vnode *, char *, struct cred *, caller_context_t *, int);
 117  117  static int32_t udf_rename(struct vnode *,
 118  118          char *, struct vnode *, char *, struct cred *, caller_context_t *, int);
 119  119  static int32_t udf_mkdir(struct vnode *,
 120  120          char *, struct vattr *, struct vnode **, struct cred *,
 121  121          caller_context_t *, int, vsecattr_t *);
 122  122  static int32_t udf_rmdir(struct vnode *,
 123  123          char *, struct vnode *, struct cred *, caller_context_t *, int);
 124  124  static int32_t udf_readdir(struct vnode *,
 125  125          struct uio *, struct cred *, int32_t *, caller_context_t *, int);
 126  126  static int32_t udf_symlink(struct vnode *,
 127  127          char *, struct vattr *, char *, struct cred *, caller_context_t *, int);
 128  128  static int32_t udf_readlink(struct vnode *,
 129  129          struct uio *, struct cred *, caller_context_t *);
 130  130  static int32_t udf_fsync(struct vnode *,
 131  131          int32_t, struct cred *, caller_context_t *);
 132  132  static void udf_inactive(struct vnode *,
 133  133          struct cred *, caller_context_t *);
 134  134  static int32_t udf_fid(struct vnode *, struct fid *, caller_context_t *);
 135  135  static int udf_rwlock(struct vnode *, int32_t, caller_context_t *);
 136  136  static void udf_rwunlock(struct vnode *, int32_t, caller_context_t *);
 137  137  static int32_t udf_seek(struct vnode *, offset_t, offset_t *,
 138  138          caller_context_t *);
 139  139  static int32_t udf_frlock(struct vnode *, int32_t,
 140  140          struct flock64 *, int32_t, offset_t, struct flk_callback *, cred_t *,
 141  141          caller_context_t *);
 142  142  static int32_t udf_space(struct vnode *, int32_t,
 143  143          struct flock64 *, int32_t, offset_t, cred_t *, caller_context_t *);
 144  144  static int32_t udf_getpage(struct vnode *, offset_t,
 145  145          size_t, uint32_t *, struct page **, size_t,
 146  146          struct seg *, caddr_t, enum seg_rw, struct cred *, caller_context_t *);
 147  147  static int32_t udf_putpage(struct vnode *, offset_t,
 148  148          size_t, int32_t, struct cred *, caller_context_t *);
 149  149  static int32_t udf_map(struct vnode *, offset_t, struct as *,
 150  150          caddr_t *, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
 151  151          caller_context_t *);
 152  152  static int32_t udf_addmap(struct vnode *, offset_t, struct as *,
 153  153          caddr_t, size_t, uint8_t, uint8_t, uint32_t, struct cred *,
 154  154          caller_context_t *);
 155  155  static int32_t udf_delmap(struct vnode *, offset_t, struct as *,
 156  156          caddr_t, size_t, uint32_t, uint32_t, uint32_t, struct cred *,
 157  157          caller_context_t *);
 158  158  static int32_t udf_l_pathconf(struct vnode *, int32_t,
 159  159          ulong_t *, struct cred *, caller_context_t *);
 160  160  static int32_t udf_pageio(struct vnode *, struct page *,
 161  161          u_offset_t, size_t, int32_t, struct cred *, caller_context_t *);
 162  162  
 163  163  int32_t ud_getpage_miss(struct vnode *, u_offset_t,
 164  164          size_t, struct seg *, caddr_t, page_t *pl[],
 165  165          size_t, enum seg_rw, int32_t);
 166  166  void ud_getpage_ra(struct vnode *, u_offset_t, struct seg *, caddr_t);
 167  167  int32_t ud_putpages(struct vnode *, offset_t, size_t, int32_t, struct cred *);
 168  168  int32_t ud_page_fill(struct ud_inode *, page_t *,
 169  169          u_offset_t, uint32_t, u_offset_t *);
 170  170  int32_t ud_iodone(struct buf *);
 171  171  int32_t ud_rdip(struct ud_inode *, struct uio *, int32_t, cred_t *);
 172  172  int32_t ud_wrip(struct ud_inode *, struct uio *, int32_t, cred_t *);
 173  173  int32_t ud_multi_strat(struct ud_inode *, page_t *, struct buf *, u_offset_t);
 174  174  int32_t ud_slave_done(struct buf *);
 175  175  
 176  176  /*
 177  177   * Structures to control multiple IO operations to get or put pages
 178  178   * that are backed by discontiguous blocks. The master struct is
 179  179   * a dummy that holds the original bp from pageio_setup. The
  180  180   * slave struct holds the working bp's to do the actual IO. Once
  181  181   * all the slave IOs complete, the master is processed as if a single
  182  182   * IO op had completed.
 183  183   */
 184  184  uint32_t master_index = 0;
 185  185  typedef struct mio_master {
 186  186          kmutex_t        mm_mutex;       /* protect the fields below */
 187  187          int32_t         mm_size;
 188  188          buf_t           *mm_bp;         /* original bp */
 189  189          int32_t         mm_resid;       /* bytes remaining to transfer */
 190  190          int32_t         mm_error;       /* accumulated error from slaves */
 191  191          int32_t         mm_index;       /* XXX debugging */
 192  192  } mio_master_t;
 193  193  
 194  194  typedef struct mio_slave {
 195  195          buf_t           ms_buf;         /* working buffer for this IO chunk */
 196  196          mio_master_t    *ms_ptr;        /* pointer to master */
 197  197  } mio_slave_t;
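            /*
             * A minimal sketch (not driver code) of how a slave completion
             * folds into its master: each slave accumulates its error and
             * byte count into the master under mm_mutex, and the slave that
             * brings mm_resid to zero completes the original bp.  The real
             * logic lives in ud_multi_strat()/ud_slave_done(); the helper
             * name below is hypothetical.  Because ms_buf is the first
             * member of mio_slave_t, the buf pointer doubles as the slave
             * pointer.
             */
            static void
            example_slave_done(struct buf *bp)
            {
                    mio_slave_t *ms = (mio_slave_t *)bp;
                    mio_master_t *mm = ms->ms_ptr;
                    int32_t done;
            
                    mutex_enter(&mm->mm_mutex);
                    if (geterror(bp) != 0) {
                            mm->mm_error = geterror(bp);
                    }
                    mm->mm_resid -= bp->b_bcount;
                    done = (mm->mm_resid == 0);
                    mutex_exit(&mm->mm_mutex);
            
                    if (done) {
                            if (mm->mm_error != 0) {
                                    bioerror(mm->mm_bp, mm->mm_error);
                            }
                            biodone(mm->mm_bp);
                    }
            }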
 198  198  
 199  199  struct vnodeops *udf_vnodeops;
 200  200  
 201  201  const fs_operation_def_t udf_vnodeops_template[] = {
 202  202          VOPNAME_OPEN,           { .vop_open = udf_open },
 203  203          VOPNAME_CLOSE,          { .vop_close = udf_close },
 204  204          VOPNAME_READ,           { .vop_read = udf_read },
 205  205          VOPNAME_WRITE,          { .vop_write = udf_write },
 206  206          VOPNAME_IOCTL,          { .vop_ioctl = udf_ioctl },
 207  207          VOPNAME_GETATTR,        { .vop_getattr = udf_getattr },
 208  208          VOPNAME_SETATTR,        { .vop_setattr = udf_setattr },
 209  209          VOPNAME_ACCESS,         { .vop_access = udf_access },
 210  210          VOPNAME_LOOKUP,         { .vop_lookup = udf_lookup },
 211  211          VOPNAME_CREATE,         { .vop_create = udf_create },
 212  212          VOPNAME_REMOVE,         { .vop_remove = udf_remove },
 213  213          VOPNAME_LINK,           { .vop_link = udf_link },
 214  214          VOPNAME_RENAME,         { .vop_rename = udf_rename },
 215  215          VOPNAME_MKDIR,          { .vop_mkdir = udf_mkdir },
 216  216          VOPNAME_RMDIR,          { .vop_rmdir = udf_rmdir },
 217  217          VOPNAME_READDIR,        { .vop_readdir = udf_readdir },
 218  218          VOPNAME_SYMLINK,        { .vop_symlink = udf_symlink },
 219  219          VOPNAME_READLINK,       { .vop_readlink = udf_readlink },
 220  220          VOPNAME_FSYNC,          { .vop_fsync = udf_fsync },
 221  221          VOPNAME_INACTIVE,       { .vop_inactive = udf_inactive },
 222  222          VOPNAME_FID,            { .vop_fid = udf_fid },
 223  223          VOPNAME_RWLOCK,         { .vop_rwlock = udf_rwlock },
 224  224          VOPNAME_RWUNLOCK,       { .vop_rwunlock = udf_rwunlock },
 225  225          VOPNAME_SEEK,           { .vop_seek = udf_seek },
 226  226          VOPNAME_FRLOCK,         { .vop_frlock = udf_frlock },
 227  227          VOPNAME_SPACE,          { .vop_space = udf_space },
 228  228          VOPNAME_GETPAGE,        { .vop_getpage = udf_getpage },
 229  229          VOPNAME_PUTPAGE,        { .vop_putpage = udf_putpage },
 230  230          VOPNAME_MAP,            { .vop_map = udf_map },
 231  231          VOPNAME_ADDMAP,         { .vop_addmap = udf_addmap },
 232  232          VOPNAME_DELMAP,         { .vop_delmap = udf_delmap },
 233  233          VOPNAME_PATHCONF,       { .vop_pathconf = udf_l_pathconf },
 234  234          VOPNAME_PAGEIO,         { .vop_pageio = udf_pageio },
 235  235          VOPNAME_VNEVENT,        { .vop_vnevent = fs_vnevent_support },
 236  236          NULL,                   NULL
 237  237  };
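            /*
             * A template of name/operation pairs like the one above is
             * compiled into the live udf_vnodeops vector via vn_make_ops()
             * at module load time.  A minimal sketch of that call follows;
             * the helper name is hypothetical (in illumos the real call is
             * made from the filesystem's init path):
             */
            static int
            example_init_udf_vnodeops(void)
            {
                    int error;
            
                    error = vn_make_ops("udfs", udf_vnodeops_template,
                        &udf_vnodeops);
                    if (error != 0) {
                            cmn_err(CE_WARN,
                                "udfs: bad vnode ops template, error %d", error);
                    }
                    return (error);
            }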
 238  238  
 239  239  /* ARGSUSED */
 240  240  static int32_t
 241  241  udf_open(
 242  242          struct vnode **vpp,
 243  243          int32_t flag,
 244  244          struct cred *cr,
 245  245          caller_context_t *ct)
 246  246  {
 247  247          ud_printf("udf_open\n");
 248  248  
 249  249          return (0);
 250  250  }
 251  251  
 252  252  /* ARGSUSED */
 253  253  static int32_t
 254  254  udf_close(
 255  255          struct vnode *vp,
 256  256          int32_t flag,
 257  257          int32_t count,
 258  258          offset_t offset,
 259  259          struct cred *cr,
 260  260          caller_context_t *ct)
 261  261  {
 262  262          struct ud_inode *ip = VTOI(vp);
 263  263  
 264  264          ud_printf("udf_close\n");
 265  265  
 266  266          ITIMES(ip);
 267  267  
 268  268          cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
 269  269          cleanshares(vp, ttoproc(curthread)->p_pid);
 270  270  
 271  271          /*
 272  272           * Push partially filled cluster at last close.
 273  273           * ``last close'' is approximated because the dnlc
 274  274           * may have a hold on the vnode.
 275  275           */
 276  276          if (vp->v_count <= 2 && vp->v_type != VBAD) {
 278  278                  if (ip->i_delaylen) {
 279  279                          (void) ud_putpages(vp, ip->i_delayoff, ip->i_delaylen,
 280  280                              B_ASYNC | B_FREE, cr);
 281  281                          ip->i_delaylen = 0;
 282  282                  }
 283  283          }
 284  284  
 285  285          return (0);
 286  286  }
 287  287  
 288  288  /* ARGSUSED */
 289  289  static int32_t
 290  290  udf_read(
 291  291          struct vnode *vp,
 292  292          struct uio *uiop,
 293  293          int32_t ioflag,
 294  294          struct cred *cr,
 295  295          caller_context_t *ct)
 296  296  {
 297  297          struct ud_inode *ip = VTOI(vp);
 298  298          int32_t error;
 299  299  
 300  300          ud_printf("udf_read\n");
 301  301  
 302  302  #ifdef  __lock_lint
 303  303          rw_enter(&ip->i_rwlock, RW_READER);
 304  304  #endif
 305  305  
 306  306          ASSERT(RW_READ_HELD(&ip->i_rwlock));
 307  307  
 308  308          if (MANDLOCK(vp, ip->i_char)) {
 309  309                  /*
 310  310                   * udf_getattr ends up being called by chklock
 311  311                   */
 312  312                  error = chklock(vp, FREAD, uiop->uio_loffset,
 313  313                      uiop->uio_resid, uiop->uio_fmode, ct);
 314  314                  if (error) {
 315  315                          goto end;
 316  316                  }
 317  317          }
 318  318  
 319  319          rw_enter(&ip->i_contents, RW_READER);
 320  320          error = ud_rdip(ip, uiop, ioflag, cr);
 321  321          rw_exit(&ip->i_contents);
 322  322  
 323  323  end:
 324  324  #ifdef  __lock_lint
 325  325          rw_exit(&ip->i_rwlock);
 326  326  #endif
 327  327  
 328  328          return (error);
 329  329  }
 330  330  
 331  331  
 332  332  int32_t ud_WRITES = 1;
 333  333  int32_t ud_HW = 96 * 1024;
 334  334  int32_t ud_LW = 64 * 1024;
 335  335  int32_t ud_throttles = 0;
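            /*
             * The tunables above form a high/low water mark throttle:
             * udf_write() sleeps on i_wrcv while i_writes (bytes of
             * outstanding async writes) exceeds ud_HW.  A minimal sketch of
             * the draining side, assuming the write completion path
             * decrements i_writes and wakes throttled writers once the
             * backlog falls to ud_LW (the helper name is hypothetical; the
             * real accounting lives in the write/completion path):
             */
            static void
            example_write_drained(struct ud_inode *ip, int32_t bytes)
            {
                    mutex_enter(&ip->i_tlock);
                    ip->i_writes -= bytes;
                    if (ip->i_writes <= ud_LW) {
                            cv_broadcast(&ip->i_wrcv);
                    }
                    mutex_exit(&ip->i_tlock);
            }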
 336  336  
 337  337  /* ARGSUSED */
 338  338  static int32_t
 339  339  udf_write(
 340  340          struct vnode *vp,
 341  341          struct uio *uiop,
 342  342          int32_t ioflag,
 343  343          struct cred *cr,
 344  344          caller_context_t *ct)
 345  345  {
 346  346          struct ud_inode *ip = VTOI(vp);
 347  347          int32_t error = 0;
 348  348  
 349  349          ud_printf("udf_write\n");
 350  350  
 351  351  #ifdef  __lock_lint
 352  352          rw_enter(&ip->i_rwlock, RW_WRITER);
 353  353  #endif
 354  354  
 355  355          ASSERT(RW_WRITE_HELD(&ip->i_rwlock));
 356  356  
 357  357          if (MANDLOCK(vp, ip->i_char)) {
 358  358                  /*
  359  359                   * udf_getattr ends up being called by chklock
 360  360                   */
 361  361                  error = chklock(vp, FWRITE, uiop->uio_loffset,
 362  362                      uiop->uio_resid, uiop->uio_fmode, ct);
 363  363                  if (error) {
 364  364                          goto end;
 365  365                  }
 366  366          }
 367  367          /*
 368  368           * Throttle writes.
 369  369           */
 370  370          mutex_enter(&ip->i_tlock);
 371  371          if (ud_WRITES && (ip->i_writes > ud_HW)) {
 372  372                  while (ip->i_writes > ud_HW) {
 373  373                          ud_throttles++;
 374  374                          cv_wait(&ip->i_wrcv, &ip->i_tlock);
 375  375                  }
 376  376          }
 377  377          mutex_exit(&ip->i_tlock);
 378  378  
 379  379          /*
 380  380           * Write to the file
 381  381           */
 382  382          rw_enter(&ip->i_contents, RW_WRITER);
 383  383          if ((ioflag & FAPPEND) != 0 && (ip->i_type == VREG)) {
 384  384                  /*
 385  385                   * In append mode start at end of file.
 386  386                   */
 387  387                  uiop->uio_loffset = ip->i_size;
 388  388          }
 389  389          error = ud_wrip(ip, uiop, ioflag, cr);
 390  390          rw_exit(&ip->i_contents);
 391  391  
 392  392  end:
 393  393  #ifdef  __lock_lint
 394  394          rw_exit(&ip->i_rwlock);
 395  395  #endif
 396  396  
 397  397          return (error);
 398  398  }
 399  399  
 400  400  /* ARGSUSED */
 401  401  static int32_t
 402  402  udf_ioctl(
 403  403          struct vnode *vp,
 404  404          int32_t cmd,
 405  405          intptr_t arg,
 406  406          int32_t flag,
 407  407          struct cred *cr,
 408  408          int32_t *rvalp,
 409  409          caller_context_t *ct)
 410  410  {
 411  411          return (ENOTTY);
 412  412  }
 413  413  
 414  414  /* ARGSUSED */
 415  415  static int32_t
 416  416  udf_getattr(
 417  417          struct vnode *vp,
 418  418          struct vattr *vap,
 419  419          int32_t flags,
 420  420          struct cred *cr,
 421  421          caller_context_t *ct)
 422  422  {
 423  423          struct ud_inode *ip = VTOI(vp);
 424  424  
 425  425          ud_printf("udf_getattr\n");
 426  426  
 427  427          if (vap->va_mask == AT_SIZE) {
 428  428                  /*
  429  429                   * For performance, if only the size is requested,
  430  430                   * don't bother with anything else.
 431  431                   */
 432  432                  vap->va_size = ip->i_size;
 433  433                  return (0);
 434  434          }
 435  435  
 436  436          rw_enter(&ip->i_contents, RW_READER);
 437  437  
 438  438          vap->va_type = vp->v_type;
 439  439          vap->va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
 440  440  
 441  441          vap->va_uid = ip->i_uid;
 442  442          vap->va_gid = ip->i_gid;
 443  443          vap->va_fsid = ip->i_dev;
 444  444          vap->va_nodeid = ip->i_icb_lbano;
 445  445          vap->va_nlink = ip->i_nlink;
 446  446          vap->va_size = ip->i_size;
 447  447          vap->va_seq = ip->i_seq;
 448  448          if (vp->v_type == VCHR || vp->v_type == VBLK) {
 449  449                  vap->va_rdev = ip->i_rdev;
 450  450          } else {
 451  451                  vap->va_rdev = 0;
 452  452          }
 453  453  
 454  454          mutex_enter(&ip->i_tlock);
 455  455          ITIMES_NOLOCK(ip);      /* mark correct time in inode */
 456  456          vap->va_atime.tv_sec = (time_t)ip->i_atime.tv_sec;
 457  457          vap->va_atime.tv_nsec = ip->i_atime.tv_nsec;
 458  458          vap->va_mtime.tv_sec = (time_t)ip->i_mtime.tv_sec;
 459  459          vap->va_mtime.tv_nsec = ip->i_mtime.tv_nsec;
 460  460          vap->va_ctime.tv_sec = (time_t)ip->i_ctime.tv_sec;
 461  461          vap->va_ctime.tv_nsec = ip->i_ctime.tv_nsec;
 462  462          mutex_exit(&ip->i_tlock);
 463  463  
 464  464          switch (ip->i_type) {
 465  465                  case VBLK:
 466  466                          vap->va_blksize = MAXBSIZE;
 467  467                          break;
 468  468                  case VCHR:
 469  469                          vap->va_blksize = MAXBSIZE;
 470  470                          break;
 471  471                  default:
 472  472                          vap->va_blksize = ip->i_udf->udf_lbsize;
 473  473                          break;
 474  474          }
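                    /*
                     * i_lbr is the count of logical blocks recorded;
                     * udf_l2d_shift converts logical blocks to 512-byte
                     * device sectors.  For example, with a 2K logical block
                     * size the shift is 2, so each recorded block
                     * contributes four sectors to va_nblocks.
                     */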
 475  475          vap->va_nblocks = ip->i_lbr << ip->i_udf->udf_l2d_shift;
 476  476  
 477  477          rw_exit(&ip->i_contents);
 478  478  
 479  479          return (0);
 480  480  }
 481  481  
 482  482  static int
 483  483  ud_iaccess_vmode(void *ip, int mode, struct cred *cr)
 484  484  {
 485  485          return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 0));
 486  486  }
 487  487  
 488  488  /*ARGSUSED4*/
 489  489  static int32_t
 490  490  udf_setattr(
 491  491          struct vnode *vp,
 492  492          struct vattr *vap,
 493  493          int32_t flags,
 494  494          struct cred *cr,
 495  495          caller_context_t *ct)
 496  496  {
 497  497          int32_t error = 0;
 498  498          uint32_t mask = vap->va_mask;
 499  499          struct ud_inode *ip;
 500  500          timestruc_t now;
 501  501          struct vattr ovap;
 502  502  
 503  503          ud_printf("udf_setattr\n");
 504  504  
 505  505          ip = VTOI(vp);
 506  506  
 507  507          /*
  508  508           * No updates allowed to 4096 files
 509  509           */
 510  510          if (ip->i_astrat == STRAT_TYPE4096) {
 511  511                  return (EINVAL);
 512  512          }
 513  513  
 514  514          /*
 515  515           * Cannot set these attributes
 516  516           */
 517  517          if (mask & AT_NOSET) {
 518  518                  return (EINVAL);
 519  519          }
 520  520  
 521  521          rw_enter(&ip->i_rwlock, RW_WRITER);
 522  522          rw_enter(&ip->i_contents, RW_WRITER);
 523  523  
 524  524          ovap.va_uid = ip->i_uid;
 525  525          ovap.va_mode = UD2VA_PERM(ip->i_perm) | ip->i_char;
 526  526          error = secpolicy_vnode_setattr(cr, vp, vap, &ovap, flags,
 527  527              ud_iaccess_vmode, ip);
 528  528          if (error)
 529  529                  goto update_inode;
 530  530  
 531  531          mask = vap->va_mask;
 532  532          /*
 533  533           * Change file access modes.
 534  534           */
 535  535          if (mask & AT_MODE) {
 536  536                  ip->i_perm = VA2UD_PERM(vap->va_mode);
 537  537                  ip->i_char = vap->va_mode & (VSUID | VSGID | VSVTX);
 538  538                  mutex_enter(&ip->i_tlock);
 539  539                  ip->i_flag |= ICHG;
 540  540                  mutex_exit(&ip->i_tlock);
 541  541          }
 542  542          if (mask & (AT_UID|AT_GID)) {
 543  543                  if (mask & AT_UID) {
 544  544                          ip->i_uid = vap->va_uid;
 545  545                  }
 546  546                  if (mask & AT_GID) {
 547  547                          ip->i_gid = vap->va_gid;
 548  548                  }
 549  549                  mutex_enter(&ip->i_tlock);
 550  550                  ip->i_flag |= ICHG;
 551  551                  mutex_exit(&ip->i_tlock);
 552  552          }
 553  553          /*
 554  554           * Truncate file.  Must have write permission and not be a directory.
 555  555           */
 556  556          if (mask & AT_SIZE) {
 557  557                  if (vp->v_type == VDIR) {
 558  558                          error = EISDIR;
 559  559                          goto update_inode;
 560  560                  }
 561  561                  if (error = ud_iaccess(ip, IWRITE, cr, 0)) {
 562  562                          goto update_inode;
 563  563                  }
 564  564                  if (vap->va_size > MAXOFFSET_T) {
 565  565                          error = EFBIG;
 566  566                          goto update_inode;
 567  567                  }
 568  568                  if (error = ud_itrunc(ip, vap->va_size, 0, cr)) {
 569  569                          goto update_inode;
 570  570                  }
 571  571  
 572  572                  if (vap->va_size == 0) {
 573  573                          vnevent_truncate(vp, ct);
 574  574                  } else {
 575  575                          vnevent_resize(vp, ct);
 576  576                  }
 577  577          }
 578  578          /*
 579  579           * Change file access or modified times.
 580  580           */
 581  581          if (mask & (AT_ATIME|AT_MTIME)) {
 582  582                  mutex_enter(&ip->i_tlock);
 583  583                  if (mask & AT_ATIME) {
 584  584                          ip->i_atime.tv_sec = vap->va_atime.tv_sec;
 585  585                          ip->i_atime.tv_nsec = vap->va_atime.tv_nsec;
 586  586                          ip->i_flag &= ~IACC;
 587  587                  }
 588  588                  if (mask & AT_MTIME) {
 589  589                          ip->i_mtime.tv_sec = vap->va_mtime.tv_sec;
 590  590                          ip->i_mtime.tv_nsec = vap->va_mtime.tv_nsec;
 591  591                          gethrestime(&now);
 592  592                          ip->i_ctime.tv_sec = now.tv_sec;
 593  593                          ip->i_ctime.tv_nsec = now.tv_nsec;
 594  594                          ip->i_flag &= ~(IUPD|ICHG);
 595  595                          ip->i_flag |= IMODTIME;
 596  596                  }
 597  597                  ip->i_flag |= IMOD;
 598  598                  mutex_exit(&ip->i_tlock);
 599  599          }
 600  600  
 601  601  update_inode:
 602  602          if (curthread->t_flag & T_DONTPEND) {
 603  603                  ud_iupdat(ip, 1);
 604  604          } else {
 605  605                  ITIMES_NOLOCK(ip);
 606  606          }
 607  607          rw_exit(&ip->i_contents);
 608  608          rw_exit(&ip->i_rwlock);
 609  609  
 610  610          return (error);
 611  611  }
 612  612  
 613  613  /* ARGSUSED */
 614  614  static int32_t
 615  615  udf_access(
 616  616          struct vnode *vp,
 617  617          int32_t mode,
 618  618          int32_t flags,
 619  619          struct cred *cr,
 620  620          caller_context_t *ct)
 621  621  {
 622  622          struct ud_inode *ip = VTOI(vp);
 623  623  
 624  624          ud_printf("udf_access\n");
 625  625  
 626  626          if (ip->i_udf == NULL) {
 627  627                  return (EIO);
 628  628          }
 629  629  
 630  630          return (ud_iaccess(ip, UD_UPERM2DPERM(mode), cr, 1));
 631  631  }
 632  632  
 633  633  int32_t udfs_stickyhack = 1;
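            /*
             * When udfs_stickyhack is set, udf_lookup() below marks a
             * non-directory that has the sticky bit set but no execute
             * permission with VISSWAP, the historical convention for
             * treating such files as swap-like for VM caching purposes.
             */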
 634  634  
 635  635  /* ARGSUSED */
 636  636  static int32_t
 637  637  udf_lookup(
 638  638          struct vnode *dvp,
 639  639          char *nm,
 640  640          struct vnode **vpp,
 641  641          struct pathname *pnp,
 642  642          int32_t flags,
 643  643          struct vnode *rdir,
 644  644          struct cred *cr,
 645  645          caller_context_t *ct,
 646  646          int *direntflags,
 647  647          pathname_t *realpnp)
 648  648  {
 649  649          int32_t error;
 650  650          struct vnode *vp;
 651  651          struct ud_inode *ip, *xip;
 652  652  
 653  653          ud_printf("udf_lookup\n");
 654  654          /*
  655  655           * A null component name is a synonym for the directory being searched.
 656  656           */
 657  657          if (*nm == '\0') {
 658  658                  VN_HOLD(dvp);
 659  659                  *vpp = dvp;
 660  660                  error = 0;
 661  661                  goto out;
 662  662          }
 663  663  
 664  664          /*
 665  665           * Fast path: Check the directory name lookup cache.
 666  666           */
 667  667          ip = VTOI(dvp);
 668  668          if (vp = dnlc_lookup(dvp, nm)) {
 669  669                  /*
 670  670                   * Check accessibility of directory.
 671  671                   */
  672  672                  if ((error = ud_iaccess(ip, IEXEC, cr, 1)) != 0)
  673  673                          VN_RELE(vp);
  674  674                  else
  675  675                          xip = VTOI(vp);
 676  676          } else {
 677  677                  error = ud_dirlook(ip, nm, &xip, cr, 1);
 678  678                  ITIMES(ip);
 679  679          }
 680  680  
 681  681          if (error == 0) {
 682  682                  ip = xip;
 683  683                  *vpp = ITOV(ip);
 684  684                  if ((ip->i_type != VDIR) &&
 685  685                      (ip->i_char & ISVTX) &&
 686  686                      ((ip->i_perm & IEXEC) == 0) &&
 687  687                      udfs_stickyhack) {
 688  688                          mutex_enter(&(*vpp)->v_lock);
 689  689                          (*vpp)->v_flag |= VISSWAP;
 690  690                          mutex_exit(&(*vpp)->v_lock);
 691  691                  }
 692  692                  ITIMES(ip);
 693  693                  /*
 694  694                   * If vnode is a device return special vnode instead.
 695  695                   */
 696  696                  if (IS_DEVVP(*vpp)) {
 697  697                          struct vnode *newvp;
 698  698                          newvp = specvp(*vpp, (*vpp)->v_rdev,
 699  699                              (*vpp)->v_type, cr);
 700  700                          VN_RELE(*vpp);
 701  701                          if (newvp == NULL) {
 702  702                                  error = ENOSYS;
 703  703                          } else {
 704  704                                  *vpp = newvp;
 705  705                          }
 706  706                  }
 707  707          }
 708  708  out:
 709  709          return (error);
 710  710  }
 711  711  
 712  712  /* ARGSUSED */
 713  713  static int32_t
 714  714  udf_create(
 715  715          struct vnode *dvp,
 716  716          char *name,
 717  717          struct vattr *vap,
 718  718          enum vcexcl excl,
 719  719          int32_t mode,
 720  720          struct vnode **vpp,
 721  721          struct cred *cr,
 722  722          int32_t flag,
 723  723          caller_context_t *ct,
 724  724          vsecattr_t *vsecp)
 725  725  {
 726  726          int32_t error;
 727  727          struct ud_inode *ip = VTOI(dvp), *xip;
 728  728  
 729  729          ud_printf("udf_create\n");
 730  730  
 731  731          if ((vap->va_mode & VSVTX) && secpolicy_vnode_stky_modify(cr) != 0)
 732  732                  vap->va_mode &= ~VSVTX;
 733  733  
 734  734          if (*name == '\0') {
 735  735                  /*
 736  736                   * Null component name refers to the directory itself.
 737  737                   */
 738  738                  VN_HOLD(dvp);
 739  739                  ITIMES(ip);
 740  740                  error = EEXIST;
 741  741          } else {
 742  742                  xip = NULL;
 743  743                  rw_enter(&ip->i_rwlock, RW_WRITER);
 744  744                  error = ud_direnter(ip, name, DE_CREATE,
 745  745                      (struct ud_inode *)0, (struct ud_inode *)0,
 746  746                      vap, &xip, cr, ct);
 747  747                  rw_exit(&ip->i_rwlock);
 748  748                  ITIMES(ip);
 749  749                  ip = xip;
 750  750          }
 751  751  #ifdef  __lock_lint
 752  752          rw_enter(&ip->i_contents, RW_WRITER);
 753  753  #else
 754  754          if (ip != NULL) {
 755  755                  rw_enter(&ip->i_contents, RW_WRITER);
 756  756          }
 757  757  #endif
 758  758  
 759  759          /*
 760  760           * If the file already exists and this is a non-exclusive create,
 761  761           * check permissions and allow access for non-directories.
 762  762           * Read-only create of an existing directory is also allowed.
 763  763           * We fail an exclusive create of anything which already exists.
 764  764           */
 765  765          if (error == EEXIST) {
 766  766                  if (excl == NONEXCL) {
 767  767                          if ((ip->i_type == VDIR) && (mode & VWRITE)) {
 768  768                                  error = EISDIR;
 769  769                          } else if (mode) {
 770  770                                  error = ud_iaccess(ip,
 771  771                                      UD_UPERM2DPERM(mode), cr, 0);
 772  772                          } else {
 773  773                                  error = 0;
 774  774                          }
 775  775                  }
 776  776                  if (error) {
 777  777                          rw_exit(&ip->i_contents);
 778  778                          VN_RELE(ITOV(ip));
 779  779                          goto out;
 780  780                  } else if ((ip->i_type == VREG) &&
 781  781                      (vap->va_mask & AT_SIZE) && vap->va_size == 0) {
 782  782                          /*
 783  783                           * Truncate regular files, if requested by caller.
 784  784                           * Grab i_rwlock to make sure no one else is
 785  785                           * currently writing to the file (we promised
 786  786                           * bmap we would do this).
 787  787                           * Must get the locks in the correct order.
 788  788                           */
 789  789                          if (ip->i_size == 0) {
 790  790                                  ip->i_flag |= ICHG | IUPD;
 791  791                          } else {
 792  792                                  rw_exit(&ip->i_contents);
 793  793                                  rw_enter(&ip->i_rwlock, RW_WRITER);
 794  794                                  rw_enter(&ip->i_contents, RW_WRITER);
 795  795                                  (void) ud_itrunc(ip, 0, 0, cr);
 796  796                                  rw_exit(&ip->i_rwlock);
 797  797                          }
 798  798                          vnevent_create(ITOV(ip), ct);
 799  799                  }
 800  800          }
 801  801  
 802  802          if (error == 0) {
 803  803                  *vpp = ITOV(ip);
 804  804                  ITIMES(ip);
 805  805          }
 806  806  #ifdef  __lock_lint
 807  807          rw_exit(&ip->i_contents);
 808  808  #else
 809  809          if (ip != NULL) {
 810  810                  rw_exit(&ip->i_contents);
 811  811          }
 812  812  #endif
 813  813          if (error) {
 814  814                  goto out;
 815  815          }
 816  816  
 817  817          /*
 818  818           * If vnode is a device return special vnode instead.
 819  819           */
 820  820          if (!error && IS_DEVVP(*vpp)) {
 821  821                  struct vnode *newvp;
 822  822  
 823  823                  newvp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr);
 824  824                  VN_RELE(*vpp);
 825  825                  if (newvp == NULL) {
 826  826                          error = ENOSYS;
 827  827                          goto out;
 828  828                  }
 829  829                  *vpp = newvp;
 830  830          }
 831  831  out:
 832  832          return (error);
 833  833  }
 834  834  
 835  835  /* ARGSUSED */
 836  836  static int32_t
 837  837  udf_remove(
 838  838          struct vnode *vp,
 839  839          char *nm,
 840  840          struct cred *cr,
 841  841          caller_context_t *ct,
 842  842          int flags)
 843  843  {
 844  844          int32_t error;
 845  845          struct ud_inode *ip = VTOI(vp);
 846  846  
 847  847          ud_printf("udf_remove\n");
 848  848  
 849  849          rw_enter(&ip->i_rwlock, RW_WRITER);
 850  850          error = ud_dirremove(ip, nm,
 851  851              (struct ud_inode *)0, (struct vnode *)0, DR_REMOVE, cr, ct);
 852  852          rw_exit(&ip->i_rwlock);
 853  853          ITIMES(ip);
 854  854  
 855  855          return (error);
 856  856  }
 857  857  
 858  858  /* ARGSUSED */
 859  859  static int32_t
 860  860  udf_link(
 861  861          struct vnode *tdvp,
 862  862          struct vnode *svp,
 863  863          char *tnm,
 864  864          struct cred *cr,
 865  865          caller_context_t *ct,
 866  866          int flags)
 867  867  {
 868  868          int32_t error;
 869  869          struct vnode *realvp;
 870  870          struct ud_inode *sip;
 871  871          struct ud_inode *tdp;
 872  872  
 873  873          ud_printf("udf_link\n");
 874  874          if (VOP_REALVP(svp, &realvp, ct) == 0) {
 875  875                  svp = realvp;
 876  876          }
 877  877  
 878  878          /*
 879  879           * Do not allow links to directories
 880  880           */
 881  881          if (svp->v_type == VDIR) {
 882  882                  return (EPERM);
 883  883          }
 884  884  
 885  885          sip = VTOI(svp);
 886  886  
 887  887          if (sip->i_uid != crgetuid(cr) && secpolicy_basic_link(cr) != 0)
 888  888                  return (EPERM);
 889  889  
 890  890          tdp = VTOI(tdvp);
 891  891  
 892  892          rw_enter(&tdp->i_rwlock, RW_WRITER);
 893  893          error = ud_direnter(tdp, tnm, DE_LINK, (struct ud_inode *)0,
 894  894              sip, (struct vattr *)0, (struct ud_inode **)0, cr, ct);
 895  895          rw_exit(&tdp->i_rwlock);
 896  896          ITIMES(sip);
 897  897          ITIMES(tdp);
 898  898  
 899  899          if (error == 0) {
 900  900                  vnevent_link(svp, ct);
 901  901          }
 902  902  
 903  903          return (error);
 904  904  }
 905  905  
 906  906  /* ARGSUSED */
 907  907  static int32_t
 908  908  udf_rename(
 909  909          struct vnode *sdvp,
 910  910          char *snm,
 911  911          struct vnode *tdvp,
 912  912          char *tnm,
 913  913          struct cred *cr,
 914  914          caller_context_t *ct,
 915  915          int flags)
 916  916  {
 917  917          int32_t error = 0;
 918  918          struct udf_vfs *udf_vfsp;
 919  919          struct ud_inode *sip;           /* source inode */
 920  920          struct ud_inode *tip;           /* target inode */
 921  921          struct ud_inode *sdp, *tdp;     /* source and target parent inode */
 922  922          struct vnode *realvp;
 923  923  
 924  924          ud_printf("udf_rename\n");
 925  925  
 926  926          if (VOP_REALVP(tdvp, &realvp, ct) == 0) {
 927  927                  tdvp = realvp;
 928  928          }
 929  929  
 930  930          sdp = VTOI(sdvp);
 931  931          tdp = VTOI(tdvp);
 932  932  
 933  933          udf_vfsp = sdp->i_udf;
 934  934  
 935  935          mutex_enter(&udf_vfsp->udf_rename_lck);
 936  936          /*
 937  937           * Look up inode of file we're supposed to rename.
 938  938           */
 939  939          if (error = ud_dirlook(sdp, snm, &sip, cr, 0)) {
 940  940                  mutex_exit(&udf_vfsp->udf_rename_lck);
 941  941                  return (error);
 942  942          }
 943  943          /*
  944  944           * Be sure this is not a directory with another file system mounted
  945  945           * over it.  If it is, just give up the locks and return
  946  946           * EBUSY.
 947  947           */
 948  948          if (vn_mountedvfs(ITOV(sip)) != NULL) {
 949  949                  error = EBUSY;
 950  950                  goto errout;
 951  951          }
 952  952          /*
 953  953           * Make sure we can delete the source entry.  This requires
 954  954           * write permission on the containing directory.  If that
 955  955           * directory is "sticky" it further requires (except for
 956  956           * privileged users) that the user own the directory or the
 957  957           * source entry, or else have permission to write the source
 958  958           * entry.
 959  959           */
 960  960          rw_enter(&sdp->i_contents, RW_READER);
 961  961          rw_enter(&sip->i_contents, RW_READER);
 962  962          if ((error = ud_iaccess(sdp, IWRITE, cr, 0)) != 0 ||
 963  963              (error = ud_sticky_remove_access(sdp, sip, cr)) != 0) {
 964  964                  rw_exit(&sip->i_contents);
 965  965                  rw_exit(&sdp->i_contents);
 966  966                  ITIMES(sip);
 967  967                  goto errout;
 968  968          }
 969  969  
 970  970          /*
 971  971           * Check for renaming '.' or '..' or alias of '.'
 972  972           */
 973  973          if ((strcmp(snm, ".") == 0) ||
 974  974              (strcmp(snm, "..") == 0) ||
 975  975              (sdp == sip)) {
 976  976                  error = EINVAL;
 977  977                  rw_exit(&sip->i_contents);
 978  978                  rw_exit(&sdp->i_contents);
 979  979                  goto errout;
 980  980          }
 981  981  
 982  982          rw_exit(&sip->i_contents);
 983  983          rw_exit(&sdp->i_contents);
 984  984  
 985  985          if (ud_dirlook(tdp, tnm, &tip, cr, 0) == 0) {
 986  986                  vnevent_pre_rename_dest(ITOV(tip), tdvp, tnm, ct);
 987  987                  VN_RELE(ITOV(tip));
 988  988          }
 989  989  
  990  990          /* Notify the target dir if it is not the same as the source. */
 991  991          if (sdvp != tdvp)
 992  992                  vnevent_pre_rename_dest_dir(tdvp, ITOV(sip), tnm, ct);
 993  993  
 994  994          vnevent_pre_rename_src(ITOV(sip), sdvp, snm, ct);
 995  995  
 996  996          /*
 997  997           * Link source to the target.
 998  998           */
 999  999          rw_enter(&tdp->i_rwlock, RW_WRITER);
1000 1000          if (error = ud_direnter(tdp, tnm, DE_RENAME, sdp, sip,
1001 1001              (struct vattr *)0, (struct ud_inode **)0, cr, ct)) {
1002 1002                  /*
1003 1003                   * ESAME isn't really an error; it indicates that the
1004 1004                   * operation should not be done because the source and target
1005 1005                   * are the same file, but that no error should be reported.
1006 1006                   */
1007 1007                  if (error == ESAME) {
1008 1008                          error = 0;
1009 1009                  }
1010 1010                  rw_exit(&tdp->i_rwlock);
1011 1011                  goto errout;
1012 1012          }
1013 1013          rw_exit(&tdp->i_rwlock);
1014 1014  
1015 1015          rw_enter(&sdp->i_rwlock, RW_WRITER);
1016 1016          /*
 1017 1017           * Unlink the source: remove the source entry.
 1018 1018           * ud_dirremove() checks that the entry still reflects sip,
 1019 1019           * and returns an error if it doesn't.  If the entry has
 1020 1020           * changed, just forget about it.  Release the source inode.
1022 1022           */
1023 1023          if ((error = ud_dirremove(sdp, snm, sip, (struct vnode *)0,
1024 1024              DR_RENAME, cr, ct)) == ENOENT) {
1025 1025                  error = 0;
1026 1026          }
1027 1027          rw_exit(&sdp->i_rwlock);
1028 1028  
1029 1029          if (error == 0) {
1030 1030                  vnevent_rename_src(ITOV(sip), sdvp, snm, ct);
1031 1031                  /*
1032 1032                   * vnevent_rename_dest and vnevent_rename_dest_dir are called
1033 1033                   * in ud_direnter().
1034 1034                   */
1035 1035          }
1036 1036  
1037 1037  errout:
1038 1038          ITIMES(sdp);
1039 1039          ITIMES(tdp);
1040 1040          VN_RELE(ITOV(sip));
1041 1041          mutex_exit(&udf_vfsp->udf_rename_lck);
1042 1042  
1043 1043          return (error);
1044 1044  }
1045 1045  
1046 1046  /* ARGSUSED */
1047 1047  static int32_t
1048 1048  udf_mkdir(
1049 1049          struct vnode *dvp,
1050 1050          char *dirname,
1051 1051          struct vattr *vap,
1052 1052          struct vnode **vpp,
1053 1053          struct cred *cr,
1054 1054          caller_context_t *ct,
1055 1055          int flags,
1056 1056          vsecattr_t *vsecp)
1057 1057  {
1058 1058          int32_t error;
1059 1059          struct ud_inode *ip;
1060 1060          struct ud_inode *xip;
1061 1061  
1062 1062          ASSERT((vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE));
1063 1063  
1064 1064          ud_printf("udf_mkdir\n");
1065 1065  
1066 1066          ip = VTOI(dvp);
1067 1067          rw_enter(&ip->i_rwlock, RW_WRITER);
1068 1068          error = ud_direnter(ip, dirname, DE_MKDIR,
1069 1069              (struct ud_inode *)0, (struct ud_inode *)0, vap, &xip, cr, ct);
1070 1070          rw_exit(&ip->i_rwlock);
1071 1071          ITIMES(ip);
1072 1072          if (error == 0) {
1073 1073                  ip = xip;
1074 1074                  *vpp = ITOV(ip);
1075 1075                  ITIMES(ip);
1076 1076          } else if (error == EEXIST) {
1077 1077                  ITIMES(xip);
1078 1078                  VN_RELE(ITOV(xip));
1079 1079          }
1080 1080  
1081 1081          return (error);
1082 1082  }
1083 1083  
1084 1084  /* ARGSUSED */
1085 1085  static int32_t
1086 1086  udf_rmdir(
1087 1087          struct vnode *vp,
1088 1088          char *nm,
1089 1089          struct vnode *cdir,
1090 1090          struct cred *cr,
1091 1091          caller_context_t *ct,
1092 1092          int flags)
1093 1093  {
1094 1094          int32_t error;
1095 1095          struct ud_inode *ip = VTOI(vp);
1096 1096  
1097 1097          ud_printf("udf_rmdir\n");
1098 1098  
1099 1099          rw_enter(&ip->i_rwlock, RW_WRITER);
1100 1100          error = ud_dirremove(ip, nm, (struct ud_inode *)0, cdir, DR_RMDIR,
1101 1101              cr, ct);
1102 1102          rw_exit(&ip->i_rwlock);
1103 1103          ITIMES(ip);
1104 1104  
1105 1105          return (error);
1106 1106  }
1107 1107  
1108 1108  /* ARGSUSED */
1109 1109  static int32_t
1110 1110  udf_readdir(
1111 1111          struct vnode *vp,
1112 1112          struct uio *uiop,
1113 1113          struct cred *cr,
1114 1114          int32_t *eofp,
1115 1115          caller_context_t *ct,
1116 1116          int flags)
1117 1117  {
1118 1118          struct ud_inode *ip;
1119 1119          struct dirent64 *nd;
1120 1120          struct udf_vfs *udf_vfsp;
1121 1121          int32_t error = 0, len, outcount = 0;
1122 1122          uint32_t dirsiz, offset;
1123 1123          uint32_t bufsize, ndlen, dummy;
1124 1124          caddr_t outbuf;
1125 1125          caddr_t outb, end_outb;
1126 1126          struct iovec *iovp;
1127 1127  
1128 1128          uint8_t *dname;
1129 1129          int32_t length;
1130 1130  
1131 1131          uint8_t *buf = NULL;
1132 1132  
1133 1133          struct fbuf *fbp = NULL;
1134 1134          struct file_id *fid;
1135 1135          uint8_t *name;
1136 1136  
1137 1137  
1138 1138          ud_printf("udf_readdir\n");
1139 1139  
1140 1140          ip = VTOI(vp);
1141 1141          udf_vfsp = ip->i_udf;
1142 1142  
1143 1143          dirsiz = ip->i_size;
1144 1144          if ((uiop->uio_offset >= dirsiz) ||
1145 1145              (ip->i_nlink <= 0)) {
1146 1146                  if (eofp) {
1147 1147                          *eofp = 1;
1148 1148                  }
1149 1149                  return (0);
1150 1150          }
1151 1151  
1152 1152          offset = uiop->uio_offset;
1153 1153          iovp = uiop->uio_iov;
1154 1154          bufsize = iovp->iov_len;
1155 1155  
1156 1156          outb = outbuf = (char *)kmem_alloc((uint32_t)bufsize, KM_SLEEP);
1157 1157          end_outb = outb + bufsize;
1158 1158          nd = (struct dirent64 *)outbuf;
1159 1159  
1160 1160          dname = (uint8_t *)kmem_zalloc(1024, KM_SLEEP);
1161 1161          buf = (uint8_t *)kmem_zalloc(udf_vfsp->udf_lbsize, KM_SLEEP);
1162 1162  
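                    /*
                     * Offset 0 is special: a "." entry is synthesized with
                     * d_off set to the sentinel 0x10, which matches no
                     * on-disk FID.  A later read starting at 0x10 is mapped
                     * back to offset 0 below, so on-disk parsing resumes at
                     * the start of the directory without re-emitting ".".
                     */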
1163 1163          if (offset == 0) {
1164 1164                  len = DIRENT64_RECLEN(1);
1165 1165                  if (((caddr_t)nd + len) >= end_outb) {
1166 1166                          error = EINVAL;
1167 1167                          goto end;
1168 1168                  }
1169 1169                  nd->d_ino = ip->i_icb_lbano;
1170 1170                  nd->d_reclen = (uint16_t)len;
1171 1171                  nd->d_off = 0x10;
1172 1172                  nd->d_name[0] = '.';
1173 1173                  bzero(&nd->d_name[1], DIRENT64_NAMELEN(len) - 1);
1174 1174                  nd = (struct dirent64 *)((char *)nd + nd->d_reclen);
1175 1175                  outcount++;
1176 1176          } else if (offset == 0x10) {
1177 1177                  offset = 0;
1178 1178          }
1179 1179  
1180 1180          while (offset < dirsiz) {
1181 1181                  error = ud_get_next_fid(ip, &fbp,
1182 1182                      offset, &fid, &name, buf);
1183 1183                  if (error != 0) {
1184 1184                          break;
1185 1185                  }
1186 1186  
1187 1187                  if ((fid->fid_flags & FID_DELETED) == 0) {
1188 1188                          if (fid->fid_flags & FID_PARENT) {
1189 1189  
1190 1190                                  len = DIRENT64_RECLEN(2);
1191 1191                                  if (((caddr_t)nd + len) >= end_outb) {
1192 1192                                          error = EINVAL;
1193 1193                                          break;
1194 1194                                  }
1195 1195  
1196 1196                                  nd->d_ino = ip->i_icb_lbano;
1197 1197                                  nd->d_reclen = (uint16_t)len;
1198 1198                                  nd->d_off = offset + FID_LEN(fid);
1199 1199                                  nd->d_name[0] = '.';
1200 1200                                  nd->d_name[1] = '.';
1201 1201                                  bzero(&nd->d_name[2],
1202 1202                                      DIRENT64_NAMELEN(len) - 2);
1203 1203                                  nd = (struct dirent64 *)
1204 1204                                      ((char *)nd + nd->d_reclen);
1205 1205                          } else {
1206 1206                                  if ((error = ud_uncompress(fid->fid_idlen,
1207 1207                                      &length, name, dname)) != 0) {
1208 1208                                          break;
1209 1209                                  }
1210 1210                                  if (length == 0) {
1211 1211                                          offset += FID_LEN(fid);
1212 1212                                          continue;
1213 1213                                  }
1214 1214                                  len = DIRENT64_RECLEN(length);
1215 1215                                  if (((caddr_t)nd + len) >= end_outb) {
1216 1216                                          if (!outcount) {
1217 1217                                                  error = EINVAL;
1218 1218                                          }
1219 1219                                          break;
1220 1220                                  }
1221 1221                                  (void) strncpy(nd->d_name,
1222 1222                                      (caddr_t)dname, length);
1223 1223                                  bzero(&nd->d_name[length],
1224 1224                                      DIRENT64_NAMELEN(len) - length);
1225 1225                                  nd->d_ino = ud_xlate_to_daddr(udf_vfsp,
1226 1226                                      SWAP_16(fid->fid_icb.lad_ext_prn),
1227 1227                                      SWAP_32(fid->fid_icb.lad_ext_loc), 1,
1228 1228                                      &dummy);
1229 1229                                  nd->d_reclen = (uint16_t)len;
1230 1230                                  nd->d_off = offset + FID_LEN(fid);
1231 1231                                  nd = (struct dirent64 *)
1232 1232                                      ((char *)nd + nd->d_reclen);
1233 1233                          }
1234 1234                          outcount++;
1235 1235                  }
1236 1236  
1237 1237                  offset += FID_LEN(fid);
1238 1238          }
1239 1239  
1240 1240  end:
1241 1241          if (fbp != NULL) {
1242 1242                  fbrelse(fbp, S_OTHER);
1243 1243          }
1244 1244          ndlen = ((char *)nd - outbuf);
1245 1245          /*
1246 1246           * In case of error do not call uiomove.
1247 1247           * Return the error to the caller.
1248 1248           */
1249 1249          if ((error == 0) && (ndlen != 0)) {
1250 1250                  error = uiomove(outbuf, (long)ndlen, UIO_READ, uiop);
1251 1251                  uiop->uio_offset = offset;
1252 1252          }
1253 1253          kmem_free((caddr_t)buf, udf_vfsp->udf_lbsize);
1254 1254          kmem_free((caddr_t)dname, 1024);
1255 1255          kmem_free(outbuf, (uint32_t)bufsize);
1256 1256          if (eofp && error == 0) {
1257 1257                  *eofp = (uiop->uio_offset >= dirsiz);
1258 1258          }
1259 1259          return (error);
1260 1260  }
1261 1261  
1262 1262  /* ARGSUSED */
1263 1263  static int32_t
1264 1264  udf_symlink(
1265 1265          struct vnode *dvp,
1266 1266          char *linkname,
1267 1267          struct vattr *vap,
1268 1268          char *target,
1269 1269          struct cred *cr,
1270 1270          caller_context_t *ct,
1271 1271          int flags)
1272 1272  {
1273 1273          int32_t error = 0, outlen;
1274 1274          uint32_t ioflag = 0;
1275 1275          struct ud_inode *ip, *dip = VTOI(dvp);
1276 1276  
1277 1277          struct path_comp *pc;
1278 1278          int8_t *dname = NULL, *uname = NULL, *sp;
1279 1279  
1280 1280          ud_printf("udf_symlink\n");
1281 1281  
1282 1282          ip = (struct ud_inode *)0;
1283 1283          vap->va_type = VLNK;
1284 1284          vap->va_rdev = 0;
1285 1285  
1286 1286          rw_enter(&dip->i_rwlock, RW_WRITER);
1287 1287          error = ud_direnter(dip, linkname, DE_CREATE,
1288 1288              (struct ud_inode *)0, (struct ud_inode *)0, vap, &ip, cr, ct);
1289 1289          rw_exit(&dip->i_rwlock);
1290 1290          if (error == 0) {
1291 1291                  dname = kmem_zalloc(1024, KM_SLEEP);
1292 1292                  uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1293 1293  
1294 1294                  pc = (struct path_comp *)uname;
1295 1295                  /*
 1296 1296                   * If the target begins with "/", record a root
 1297 1297                   * component for it and skip the leading slashes.
1298 1298                   */
1299 1299                  if (*target == '/') {
1300 1300                          pc->pc_type = 2;
1301 1301                          pc->pc_len = 0;
1302 1302                          pc = (struct path_comp *)(((char *)pc) + 4);
1303 1303                          while (*target == '/') {
1304 1304                                  target++;
1305 1305                          }
1306 1306                  }
1307 1307  
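                            /*
                             * Worked example (illustrative): a target of
                             * "/a/.." is recorded as three path components:
                             * type 2 (root prefix, pc_len 0), type 5 with
                             * identifier "a", and type 3 ("..").  Type 4
                             * encodes ".".  Each component is a 4-byte
                             * header followed by pc_len identifier bytes.
                             */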
 1308 1308                  while (*target != '\0') {
1309 1309                          sp = target;
1310 1310                          while ((*target != '/') && (*target != '\0')) {
 1311 1311                                  target++;
1312 1312                          }
1313 1313                          /*
1314 1314                           * We got the next component of the
1315 1315                           * path name. Create path_comp of
1316 1316                           * appropriate type
1317 1317                           */
1318 1318                          if (((target - sp) == 1) && (*sp == '.')) {
1319 1319                                  /*
1320 1320                                   * Dot entry.
1321 1321                                   */
1322 1322                                  pc->pc_type = 4;
1323 1323                                  pc = (struct path_comp *)(((char *)pc) + 4);
1324 1324                          } else if (((target - sp) == 2) &&
1325 1325                              (*sp == '.') && ((*(sp + 1)) == '.')) {
1326 1326                                  /*
1327 1327                                   * DotDot entry.
1328 1328                                   */
1329 1329                                  pc->pc_type = 3;
1330 1330                                  pc = (struct path_comp *)(((char *)pc) + 4);
1331 1331                          } else {
1332 1332                                  /*
1333 1333                                   * Convert the user-supplied name
1334 1334                                   * into the compressed form that is
1335 1335                                   * stored on the media
1336 1336                                   */
1337 1337                                  outlen = 1024;  /* set to size of dname */
1338 1338                                  if (error = ud_compress(target - sp, &outlen,
1339 1339                                      (uint8_t *)sp, (uint8_t *)dname)) {
1340 1340                                          break;
1341 1341                                  }
1342 1342                                  pc->pc_type = 5;
1343 1343                                  /* LINTED */
1344 1344                                  pc->pc_len = outlen;
1345 1345                                  dname[outlen] = '\0';
1346 1346                                  (void) strcpy((char *)pc->pc_id, dname);
1347 1347                                  pc = (struct path_comp *)
1348 1348                                      (((char *)pc) + 4 + outlen);
1349 1349                          }
1350 1350                          while (*target == '/') {
1351 1351                                  target++;
1352 1352                          }
1353 1353                          if (*target == '\0') {
1354 1354                                  break;
1355 1355                          }
1356 1356                  }
1357 1357  
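                           /*
                            * Write the assembled path_comp records out as
                            * the symlink's file data; the length is how far
                            * the cursor "pc" advanced from the start of the
                            * buffer.
                            */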
1358 1358                  rw_enter(&ip->i_contents, RW_WRITER);
1359 1359                  if (error == 0) {
1360 1360                          ioflag = FWRITE;
1361 1361                          if (curthread->t_flag & T_DONTPEND) {
1362 1362                                  ioflag |= FDSYNC;
1363 1363                          }
1364 1364                          error = ud_rdwri(UIO_WRITE, ioflag, ip,
1365 1365                              uname, ((int8_t *)pc) - uname,
1366 1366                              (offset_t)0, UIO_SYSSPACE, (int32_t *)0, cr);
1367 1367                  }
1368 1368                  if (error) {
1369 1369                          ud_idrop(ip);
1370 1370                          rw_exit(&ip->i_contents);
1371 1371                          rw_enter(&dip->i_rwlock, RW_WRITER);
1372 1372                          (void) ud_dirremove(dip, linkname, (struct ud_inode *)0,
1373 1373                              (struct vnode *)0, DR_REMOVE, cr, ct);
1374 1374                          rw_exit(&dip->i_rwlock);
1375 1375                          goto update_inode;
1376 1376                  }
1377 1377                  rw_exit(&ip->i_contents);
1378 1378          }
1379 1379  
1380 1380          if ((error == 0) || (error == EEXIST)) {
1381 1381                  VN_RELE(ITOV(ip));
1382 1382          }
1383 1383  
1384 1384  update_inode:
1385 1385          ITIMES(VTOI(dvp));
1386 1386          if (uname != NULL) {
1387 1387                  kmem_free(uname, PAGESIZE);
1388 1388          }
1389 1389          if (dname != NULL) {
1390 1390                  kmem_free(dname, 1024);
1391 1391          }
1392 1392  
1393 1393          return (error);
1394 1394  }
1395 1395  
1396 1396  /* ARGSUSED */
1397 1397  static int32_t
1398 1398  udf_readlink(
1399 1399          struct vnode *vp,
1400 1400          struct uio *uiop,
1401 1401          struct cred *cr,
1402 1402          caller_context_t *ct)
1403 1403  {
1404 1404          int32_t error = 0, off, id_len, size, len;
1405 1405          int8_t *dname = NULL, *uname = NULL;
1406 1406          struct ud_inode *ip;
1407 1407          struct fbuf *fbp = NULL;
1408 1408          struct path_comp *pc;
1409 1409  
1410 1410          ud_printf("udf_readlink\n");
1411 1411  
1412 1412          if (vp->v_type != VLNK) {
1413 1413                  return (EINVAL);
1414 1414          }
1415 1415  
1416 1416          ip = VTOI(vp);
1417 1417          size = ip->i_size;
1418 1418          if (size > PAGESIZE) {
1419 1419                  return (EIO);
1420 1420          }
1421 1421  
1422 1422          if (size == 0) {
1423 1423                  return (0);
1424 1424          }
1425 1425  
1426 1426          dname = kmem_zalloc(1024, KM_SLEEP);
1427 1427          uname = kmem_zalloc(PAGESIZE, KM_SLEEP);
1428 1428  
1429 1429          rw_enter(&ip->i_contents, RW_READER);
1430 1430  
1431 1431          if ((error = fbread(vp, 0, size, S_READ, &fbp)) != 0) {
1432 1432                  goto end;
1433 1433          }
1434 1434  
1435 1435          off = 0;
1436 1436  
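                   /*
                    * Walk the stored path_comp records and rebuild a
                    * "/"-separated path: type 1 resolves to the mount
                    * point, type 2 to the root, types 3 and 4 to ".."
                    * and ".", and type 5 to an uncompressed component
                    * name.
                    */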
1437 1437          while (off < size) {
1438 1438                  pc = (struct path_comp *)(fbp->fb_addr + off);
1439 1439                  switch (pc->pc_type) {
1440 1440                          case 1 :
1441 1441                                  (void) strcpy(uname, ip->i_udf->udf_fsmnt);
1442 1442                                  (void) strcat(uname, "/");
1443 1443                                  break;
1444 1444                          case 2 :
1445 1445                                  if (pc->pc_len != 0) {
1446 1446                                          goto end;
1447 1447                                  }
1448 1448                                  uname[0] = '/';
1449 1449                                  uname[1] = '\0';
1450 1450                                  break;
1451 1451                          case 3 :
1452 1452                                  (void) strcat(uname, "../");
1453 1453                                  break;
1454 1454                          case 4 :
1455 1455                                  (void) strcat(uname, "./");
1456 1456                                  break;
1457 1457                          case 5 :
1458 1458                                  if ((error = ud_uncompress(pc->pc_len, &id_len,
1459 1459                                      pc->pc_id, (uint8_t *)dname)) != 0) {
1460 1460                                          goto end;
1461 1461                                  }
1462 1462                                  dname[id_len] = '\0';
1463 1463                                  (void) strcat(uname, dname);
1464 1464                                  (void) strcat(uname, "/");
1465 1465                                  break;
1466 1466                          default :
1467 1467                                  error = EINVAL;
1468 1468                                  goto end;
1469 1469                  }
1470 1470                  off += 4 + pc->pc_len;
1471 1471          }
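                   /*
                    * Every component above appended a trailing '/';
                    * strip it here, except for a link that is just "/".
                    */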
1472 1472          len = strlen(uname) - 1;
1473 1473          if (uname[len] == '/') {
1474 1474                  if (len == 0) {
1475 1475                          /*
1476 1476                           * special case link to /
1477 1477                           */
1478 1478                          len = 1;
1479 1479                  } else {
1480 1480                          uname[len] = '\0';
1481 1481                  }
1482 1482          }
1483 1483  
1484 1484          error = uiomove(uname, len, UIO_READ, uiop);
1485 1485  
1486 1486          ITIMES(ip);
1487 1487  
1488 1488  end:
1489 1489          if (fbp != NULL) {
1490 1490                  fbrelse(fbp, S_OTHER);
1491 1491          }
1492 1492          rw_exit(&ip->i_contents);
1493 1493          if (uname != NULL) {
1494 1494                  kmem_free(uname, PAGESIZE);
1495 1495          }
1496 1496          if (dname != NULL) {
1497 1497                  kmem_free(dname, 1024);
1498 1498          }
1499 1499          return (error);
1500 1500  }
1501 1501  
1502 1502  /* ARGSUSED */
1503 1503  static int32_t
1504 1504  udf_fsync(
1505 1505          struct vnode *vp,
1506 1506          int32_t syncflag,
1507 1507          struct cred *cr,
1508 1508          caller_context_t *ct)
1509 1509  {
1510 1510          int32_t error = 0;
1511 1511          struct ud_inode *ip = VTOI(vp);
1512 1512  
1513 1513          ud_printf("udf_fsync\n");
1514 1514  
1515 1515          rw_enter(&ip->i_contents, RW_WRITER);
1516 1516          if (!(IS_SWAPVP(vp))) {
1517 1517                  error = ud_syncip(ip, 0, I_SYNC); /* Do synchronous writes */
1518 1518          }
1519 1519          if (error == 0) {
1520 1520                  error = ud_sync_indir(ip);
1521 1521          }
1522 1522          ITIMES(ip);             /* XXX: is this necessary ??? */
1523 1523          rw_exit(&ip->i_contents);
1524 1524  
1525 1525          return (error);
1526 1526  }
1527 1527  
1528 1528  /* ARGSUSED */
1529 1529  static void
1530 1530  udf_inactive(struct vnode *vp, struct cred *cr, caller_context_t *ct)
1531 1531  {
1532 1532          ud_printf("udf_inactive\n");
1533 1533  
1534 1534          ud_iinactive(VTOI(vp), cr);
1535 1535  }
1536 1536  
1537 1537  /* ARGSUSED */
1538 1538  static int32_t
1539 1539  udf_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
1540 1540  {
1541 1541          struct udf_fid *udfidp;
1542 1542          struct ud_inode *ip = VTOI(vp);
1543 1543  
1544 1544          ud_printf("udf_fid\n");
1545 1545  
1546 1546          if (fidp->fid_len < (sizeof (struct udf_fid) - sizeof (uint16_t))) {
1547 1547                  fidp->fid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1548 1548                  return (ENOSPC);
1549 1549          }
1550 1550  
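                   /*
                    * The file id is built from the partition reference
                    * number, the ICB block and the low 32 bits of the
                    * unique id, which is enough to locate the inode
                    * again (e.g. for NFS file handles).
                    */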
1551 1551          udfidp = (struct udf_fid *)fidp;
1552 1552          bzero((char *)udfidp, sizeof (struct udf_fid));
1553 1553          rw_enter(&ip->i_contents, RW_READER);
1554 1554          udfidp->udfid_len = sizeof (struct udf_fid) - sizeof (uint16_t);
1555 1555          udfidp->udfid_uinq_lo = ip->i_uniqid & 0xffffffff;
1556 1556          udfidp->udfid_prn = ip->i_icb_prn;
1557 1557          udfidp->udfid_icb_lbn = ip->i_icb_block;
1558 1558          rw_exit(&ip->i_contents);
1559 1559  
1560 1560          return (0);
1561 1561  }
1562 1562  
1563 1563  /* ARGSUSED2 */
1564 1564  static int
1565 1565  udf_rwlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1566 1566  {
1567 1567          struct ud_inode *ip = VTOI(vp);
1568 1568  
1569 1569          ud_printf("udf_rwlock\n");
1570 1570  
1571 1571          if (write_lock) {
1572 1572                  rw_enter(&ip->i_rwlock, RW_WRITER);
1573 1573          } else {
1574 1574                  rw_enter(&ip->i_rwlock, RW_READER);
1575 1575          }
1576 1576  #ifdef  __lock_lint
1577 1577          rw_exit(&ip->i_rwlock);
1578 1578  #endif
1579 1579          return (write_lock);
1580 1580  }
1581 1581  
1582 1582  /* ARGSUSED */
1583 1583  static void
1584 1584  udf_rwunlock(struct vnode *vp, int32_t write_lock, caller_context_t *ctp)
1585 1585  {
1586 1586          struct ud_inode *ip = VTOI(vp);
1587 1587  
1588 1588          ud_printf("udf_rwunlock\n");
1589 1589  
1590 1590  #ifdef  __lock_lint
1591 1591          rw_enter(&ip->i_rwlock, RW_WRITER);
1592 1592  #endif
1593 1593  
1594 1594          rw_exit(&ip->i_rwlock);
1595 1595  
1596 1596  }
1597 1597  
1598 1598  /* ARGSUSED */
1599 1599  static int32_t
1600 1600  udf_seek(struct vnode *vp, offset_t ooff, offset_t *noffp, caller_context_t *ct)
1601 1601  {
1602 1602          return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
1603 1603  }
1604 1604  
1605 1605  static int32_t
1606 1606  udf_frlock(
1607 1607          struct vnode *vp,
1608 1608          int32_t cmd,
1609 1609          struct flock64 *bfp,
1610 1610          int32_t flag,
1611 1611          offset_t offset,
1612 1612          struct flk_callback *flk_cbp,
1613 1613          cred_t *cr,
1614 1614          caller_context_t *ct)
1615 1615  {
1616 1616          struct ud_inode *ip = VTOI(vp);
1617 1617  
1618 1618          ud_printf("udf_frlock\n");
1619 1619  
1620 1620          /*
1621 1621           * If file is being mapped, disallow frlock.
1622 1622           * XXX I am not holding tlock while checking i_mapcnt because the
1623 1623           * current locking strategy drops all locks before calling fs_frlock.
1624 1624           * So, mapcnt could change before we enter fs_frlock, making it
1625 1625           * meaningless to have held tlock in the first place.
1626 1626           */
1627 1627          if ((ip->i_mapcnt > 0) &&
1628 1628              (MANDLOCK(vp, ip->i_char))) {
1629 1629                  return (EAGAIN);
1630 1630          }
1631 1631  
1632 1632          return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
1633 1633  }
1634 1634  
1635 1635  /*ARGSUSED6*/
1636 1636  static int32_t
1637 1637  udf_space(
1638 1638          struct vnode *vp,
1639 1639          int32_t cmd,
1640 1640          struct flock64 *bfp,
1641 1641          int32_t flag,
1642 1642          offset_t offset,
1643 1643          cred_t *cr,
1644 1644          caller_context_t *ct)
1645 1645  {
1646 1646          int32_t error = 0;
1647 1647  
1648 1648          ud_printf("udf_space\n");
1649 1649  
1650 1650          if (cmd != F_FREESP) {
1651 1651                  error =  EINVAL;
1652 1652          } else if ((error = convoff(vp, bfp, 0, offset)) == 0) {
1653 1653                  error = ud_freesp(vp, bfp, flag, cr);
1654 1654  
1655 1655                  if (error == 0) {
1656 1656                          if (bfp->l_start == 0) {
1657 1657                                  vnevent_truncate(vp, ct);
1658 1658                          } else {
1659 1659                                  vnevent_resize(vp, ct);
1660 1660                          }
1661 1661                  }
1662 1662          }
1663 1663  
1664 1664          return (error);
1665 1665  }
1666 1666  
1667 1667  /* ARGSUSED */
1668 1668  static int32_t
1669 1669  udf_getpage(
1670 1670          struct vnode *vp,
1671 1671          offset_t off,
1672 1672          size_t len,
1673 1673          uint32_t *protp,
1674 1674          struct page **plarr,
1675 1675          size_t plsz,
1676 1676          struct seg *seg,
1677 1677          caddr_t addr,
1678 1678          enum seg_rw rw,
1679 1679          struct cred *cr,
1680 1680          caller_context_t *ct)
1681 1681  {
1682 1682          struct ud_inode *ip = VTOI(vp);
1683 1683          int32_t error, has_holes, beyond_eof, seqmode, dolock;
1684 1684          int32_t pgsize = PAGESIZE;
1685 1685          struct udf_vfs *udf_vfsp = ip->i_udf;
1686 1686          page_t **pl;
1687 1687          u_offset_t pgoff, eoff, uoff;
1688 1688          krw_t rwtype;
1689 1689          caddr_t pgaddr;
1690 1690  
1691 1691          ud_printf("udf_getpage\n");
1692 1692  
1693 1693          uoff = (u_offset_t)off; /* type conversion */
1694 1694          if (protp) {
1695 1695                  *protp = PROT_ALL;
1696 1696          }
1697 1697          if (vp->v_flag & VNOMAP) {
1698 1698                  return (ENOSYS);
1699 1699          }
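                   /*
                    * Treat the access as sequential when this request
                    * starts exactly where the previous one left off
                    * (i_nextr); that enables the cluster read-ahead
                    * below.
                    */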
1700 1700          seqmode = ip->i_nextr == uoff && rw != S_CREATE;
1701 1701  
1702 1702          rwtype = RW_READER;
1703 1703          dolock = (rw_owner(&ip->i_contents) != curthread);
1704 1704  retrylock:
1705 1705  #ifdef  __lock_lint
1706 1706          rw_enter(&ip->i_contents, rwtype);
1707 1707  #else
1708 1708          if (dolock) {
1709 1709                  rw_enter(&ip->i_contents, rwtype);
1710 1710          }
1711 1711  #endif
1712 1712  
1713 1713          /*
1714 1714           * We may be getting called as a side effect of a bmap using
1715 1715           * fbread() when the blocks might be being allocated and the
1716 1716           * size has not yet been up'ed.  In this case we want to be
1717 1717           * able to return zero pages if we get back UDF_HOLE from
1718 1718           * calling bmap for a non write case here.  We also might have
1719 1719           * to read some frags from the disk into a page if we are
1720 1720           * extending the number of frags for a given lbn in bmap().
1721 1721           */
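                   /*
                    * A fault wholly beyond EOF (rounded up to a page)
                    * is an error, except through segkmap, which
                    * fbread() uses while the file is being extended.
                    */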
1722 1722          beyond_eof = uoff + len > ip->i_size + PAGEOFFSET;
1723 1723          if (beyond_eof && seg != segkmap) {
1724 1724  #ifdef  __lock_lint
1725 1725                  rw_exit(&ip->i_contents);
1726 1726  #else
1727 1727                  if (dolock) {
1728 1728                          rw_exit(&ip->i_contents);
1729 1729                  }
1730 1730  #endif
1731 1731                  return (EFAULT);
1732 1732          }
1733 1733  
1734 1734          /*
1735 1735           * Must hold i_contents lock throughout the call to pvn_getpages
1736 1736           * since locked pages are returned from each call to ud_getapage.
1737 1737           * Must *not* return locked pages and then try for contents lock
1738 1738           * due to lock ordering requirements (inode > page)
1739 1739           */
1740 1740  
1741 1741          has_holes = ud_bmap_has_holes(ip);
1742 1742  
1743 1743          if ((rw == S_WRITE || rw == S_CREATE) && (has_holes || beyond_eof)) {
1744 1744                  int32_t blk_size, count;
1745 1745                  u_offset_t offset;
1746 1746  
1747 1747                  /*
1748 1748                   * We must acquire the RW_WRITER lock in order to
1749 1749                   * call bmap_write().
1750 1750                   */
1751 1751                  if (dolock && rwtype == RW_READER) {
1752 1752                          rwtype = RW_WRITER;
1753 1753  
1754 1754                          if (!rw_tryupgrade(&ip->i_contents)) {
1755 1755  
1756 1756                                  rw_exit(&ip->i_contents);
1757 1757  
1758 1758                                  goto retrylock;
1759 1759                          }
1760 1760                  }
1761 1761  
1762 1762                  /*
1763 1763                   * May be allocating disk blocks for holes here as
1764 1764                   * a result of mmap faults. write(2) does the bmap_write
1765 1765                   * in rdip/wrip, not here. We are not dealing with frags
1766 1766                   * in this case.
1767 1767                   */
1768 1768                  offset = uoff;
1769 1769                  while ((offset < uoff + len) &&
1770 1770                      (offset < ip->i_size)) {
1771 1771                          /*
1772 1772                           * the variable "bnp" is to simplify the expression for
1773 1773                           * the compiler; just passing in &bn to bmap_write
1774 1774                           * causes a compiler "loop"
1775 1775                           */
1776 1776  
1777 1777                          blk_size = udf_vfsp->udf_lbsize;
1778 1778                          if ((offset + blk_size) > ip->i_size) {
1779 1779                                  count = ip->i_size - offset;
1780 1780                          } else {
1781 1781                                  count = blk_size;
1782 1782                          }
1783 1783                          error = ud_bmap_write(ip, offset, count, 0, cr);
1784 1784                          if (error) {
1785 1785                                  goto update_inode;
1786 1786                          }
1787 1787                          offset += count; /* XXX - make this contig */
1788 1788                  }
1789 1789          }
1790 1790  
1791 1791          /*
1792 1792           * Can be a reader from now on.
1793 1793           */
1794 1794  #ifdef  __lock_lint
1795 1795          if (rwtype == RW_WRITER) {
1796 1796                  rw_downgrade(&ip->i_contents);
1797 1797          }
1798 1798  #else
1799 1799          if (dolock && rwtype == RW_WRITER) {
1800 1800                  rw_downgrade(&ip->i_contents);
1801 1801          }
1802 1802  #endif
1803 1803  
1804 1804          /*
1805 1805           * We remove PROT_WRITE in cases when the file has UDF holes
1806 1806           * because we don't want to call bmap_read() to check each
1807 1807           * page if it is backed with a disk block.
1808 1808           */
1809 1809          if (protp && has_holes && rw != S_WRITE && rw != S_CREATE) {
1810 1810                  *protp &= ~PROT_WRITE;
1811 1811          }
1812 1812  
1813 1813          error = 0;
1814 1814  
1815 1815          /*
1816 1816           * The loop looks up pages in the range <off, off + len).
1817 1817           * For each page, we first check if we should initiate an asynchronous
1818 1818           * read ahead before we call page_lookup (we may sleep in page_lookup
1819 1819           * for a previously initiated disk read).
1820 1820           */
1821 1821          eoff = (uoff + len);
1822 1822          for (pgoff = uoff, pgaddr = addr, pl = plarr;
1823 1823              pgoff < eoff; /* empty */) {
1824 1824                  page_t  *pp;
1825 1825                  u_offset_t      nextrio;
1826 1826                  se_t    se;
1827 1827  
1828 1828                  se = ((rw == S_CREATE) ? SE_EXCL : SE_SHARED);
1829 1829  
1830 1830                  /*
1831 1831                   * Handle async getpage (faultahead)
1832 1832                   */
1833 1833                  if (plarr == NULL) {
1834 1834                          ip->i_nextrio = pgoff;
1835 1835                          ud_getpage_ra(vp, pgoff, seg, pgaddr);
1836 1836                          pgoff += pgsize;
1837 1837                          pgaddr += pgsize;
1838 1838                          continue;
1839 1839                  }
1840 1840  
1841 1841                  /*
1842 1842                   * Check if we should initiate read ahead of next cluster.
1843 1843                   * We call page_exists only when we need to confirm that
1844 1844                   * we have the current page before we initiate the read ahead.
1845 1845                   */
1846 1846                  nextrio = ip->i_nextrio;
1847 1847                  if (seqmode &&
1848 1848                      pgoff + RD_CLUSTSZ(ip) >= nextrio && pgoff <= nextrio &&
1849 1849                      nextrio < ip->i_size && page_exists(vp, pgoff))
1850 1850                          ud_getpage_ra(vp, pgoff, seg, pgaddr);
1851 1851  
1852 1852                  if ((pp = page_lookup(vp, pgoff, se)) != NULL) {
1853 1853  
1854 1854                          /*
1855 1855                           * We found the page in the page cache.
1856 1856                           */
1857 1857                          *pl++ = pp;
1858 1858                          pgoff += pgsize;
1859 1859                          pgaddr += pgsize;
1860 1860                          len -= pgsize;
1861 1861                          plsz -= pgsize;
1862 1862                  } else  {
1863 1863  
1864 1864                          /*
1865 1865                           * We have to create the page, or read it from disk.
1866 1866                           */
1867 1867                          if (error = ud_getpage_miss(vp, pgoff, len,
1868 1868                              seg, pgaddr, pl, plsz, rw, seqmode)) {
1869 1869                                  goto error_out;
1870 1870                          }
1871 1871  
1872 1872                          while (*pl != NULL) {
1873 1873                                  pl++;
1874 1874                                  pgoff += pgsize;
1875 1875                                  pgaddr += pgsize;
1876 1876                                  len -= pgsize;
1877 1877                                  plsz -= pgsize;
1878 1878                          }
1879 1879                  }
1880 1880          }
1881 1881  
1882 1882          /*
1883 1883           * Return pages up to plsz if they are in the page cache.
1884 1884           * We cannot return pages if there is a chance that they are
1885 1885           * backed with a UDF hole and rw is S_WRITE or S_CREATE.
1886 1886           */
1887 1887          if (plarr && !(has_holes && (rw == S_WRITE || rw == S_CREATE))) {
1888 1888  
1889 1889                  ASSERT((protp == NULL) ||
1890 1890                      !(has_holes && (*protp & PROT_WRITE)));
1891 1891  
1892 1892                  eoff = pgoff + plsz;
1893 1893                  while (pgoff < eoff) {
1894 1894                          page_t          *pp;
1895 1895  
1896 1896                          if ((pp = page_lookup_nowait(vp, pgoff,
1897 1897                              SE_SHARED)) == NULL)
1898 1898                                  break;
1899 1899  
1900 1900                          *pl++ = pp;
1901 1901                          pgoff += pgsize;
1902 1902                          plsz -= pgsize;
1903 1903                  }
1904 1904          }
1905 1905  
1906 1906          if (plarr)
1907 1907                  *pl = NULL;                     /* Terminate page list */
1908 1908          ip->i_nextr = pgoff;
1909 1909  
1910 1910  error_out:
1911 1911          if (error && plarr) {
1912 1912                  /*
1913 1913                   * Release any pages we have locked.
1914 1914                   */
1915 1915                  while (pl > &plarr[0])
1916 1916                          page_unlock(*--pl);
1917 1917  
1918 1918                  plarr[0] = NULL;
1919 1919          }
1920 1920  
1921 1921  update_inode:
1922 1922  #ifdef  __lock_lint
1923 1923          rw_exit(&ip->i_contents);
1924 1924  #else
1925 1925          if (dolock) {
1926 1926                  rw_exit(&ip->i_contents);
1927 1927          }
1928 1928  #endif
1929 1929  
1930 1930          /*
1931 1931           * If the inode is not already marked for IACC (in rwip() for read)
1932 1932           * and the inode is not marked for no access time update (in rwip()
1933 1933           * for write) then update the inode access time and mod time now.
1934 1934           */
1935 1935          mutex_enter(&ip->i_tlock);
1936 1936          if ((ip->i_flag & (IACC | INOACC)) == 0) {
1937 1937                  if ((rw != S_OTHER) && (ip->i_type != VDIR)) {
1938 1938                          ip->i_flag |= IACC;
1939 1939                  }
1940 1940                  if (rw == S_WRITE) {
1941 1941                          ip->i_flag |= IUPD;
1942 1942                  }
1943 1943                  ITIMES_NOLOCK(ip);
1944 1944          }
1945 1945          mutex_exit(&ip->i_tlock);
1946 1946  
1947 1947          return (error);
1948 1948  }
1949 1949  
1950 1950  int32_t ud_delay = 1;
1951 1951  
1952 1952  /* ARGSUSED */
1953 1953  static int32_t
1954 1954  udf_putpage(
1955 1955          struct vnode *vp,
1956 1956          offset_t off,
1957 1957          size_t len,
1958 1958          int32_t flags,
1959 1959          struct cred *cr,
1960 1960          caller_context_t *ct)
1961 1961  {
1962 1962          struct ud_inode *ip;
1963 1963          int32_t error = 0;
1964 1964  
1965 1965          ud_printf("udf_putpage\n");
1966 1966  
1967 1967          ip = VTOI(vp);
1968 1968  #ifdef  __lock_lint
1969 1969          rw_enter(&ip->i_contents, RW_WRITER);
1970 1970  #endif
1971 1971  
1972 1972          if (vp->v_count == 0) {
1973 1973                  cmn_err(CE_WARN, "ud_putpage : bad v_count");
1974 1974                  error = EINVAL;
1975 1975                  goto out;
1976 1976          }
1977 1977  
1978 1978          if (vp->v_flag & VNOMAP) {
1979 1979                  error = ENOSYS;
1980 1980                  goto out;
1981 1981          }
1982 1982  
1983 1983          if (flags & B_ASYNC) {
1984 1984                  if (ud_delay && len &&
1985 1985                      (flags & ~(B_ASYNC|B_DONTNEED|B_FREE)) == 0) {
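                                   /*
                                    * Delayed-write clustering: accumulate
                                    * contiguous asynchronous putpage
                                    * requests in i_delayoff/i_delaylen and
                                    * push them out in WR_CLUSTSZ-sized
                                    * chunks.
                                    */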
1986 1986                          mutex_enter(&ip->i_tlock);
1987 1987  
1988 1988                          /*
1989 1989                           * If nobody stalled, start a new cluster.
1990 1990                           */
1991 1991                          if (ip->i_delaylen == 0) {
1992 1992                                  ip->i_delayoff = off;
1993 1993                                  ip->i_delaylen = len;
1994 1994                                  mutex_exit(&ip->i_tlock);
1995 1995                                  goto out;
1996 1996                          }
1997 1997  
1998 1998                          /*
1999 1999                           * If we have a full cluster or they are not contig,
2000 2000                           * then push last cluster and start over.
2001 2001                           */
2002 2002                          if (ip->i_delaylen >= WR_CLUSTSZ(ip) ||
2003 2003                              ip->i_delayoff + ip->i_delaylen != off) {
2004 2004                                  u_offset_t doff;
2005 2005                                  size_t dlen;
2006 2006  
2007 2007                                  doff = ip->i_delayoff;
2008 2008                                  dlen = ip->i_delaylen;
2009 2009                                  ip->i_delayoff = off;
2010 2010                                  ip->i_delaylen = len;
2011 2011                                  mutex_exit(&ip->i_tlock);
2012 2012                                  error = ud_putpages(vp, doff, dlen, flags, cr);
2013 2013                                  /* LMXXX - flags are new val, not old */
2014 2014                                  goto out;
2015 2015                          }
2016 2016  
2017 2017                          /*
2018 2018                           * There is something there, it's not full, and
2019 2019                           * it is contig.
2020 2020                           */
2021 2021                          ip->i_delaylen += len;
2022 2022                          mutex_exit(&ip->i_tlock);
2023 2023                          goto out;
2024 2024                  }
2025 2025  
2026 2026                  /*
2027 2027                   * Must have weird flags or we are not clustering.
2028 2028                   */
2029 2029          }
2030 2030  
2031 2031          error = ud_putpages(vp, off, len, flags, cr);
2032 2032  
2033 2033  out:
2034 2034  #ifdef  __lock_lint
2035 2035          rw_exit(&ip->i_contents);
2036 2036  #endif
2037 2037          return (error);
2038 2038  }
2039 2039  
2040 2040  /* ARGSUSED */
2041 2041  static int32_t
2042 2042  udf_map(
2043 2043          struct vnode *vp,
2044 2044          offset_t off,
2045 2045          struct as *as,
2046 2046          caddr_t *addrp,
2047 2047          size_t len,
2048 2048          uint8_t prot,
2049 2049          uint8_t maxprot,
2050 2050          uint32_t flags,
2051 2051          struct cred *cr,
2052 2052          caller_context_t *ct)
2053 2053  {
2054 2054          struct segvn_crargs vn_a;
2055 2055          int32_t error = 0;
2056 2056  
2057 2057          ud_printf("udf_map\n");
2058 2058  
2059 2059          if (vp->v_flag & VNOMAP) {
2060 2060                  error = ENOSYS;
2061 2061                  goto end;
2062 2062          }
2063 2063  
2064 2064          if ((off < (offset_t)0) ||
2065 2065              ((off + len) < (offset_t)0)) {
2066 2066                  error = EINVAL;
2067 2067                  goto end;
2068 2068          }
2069 2069  
2070 2070          if (vp->v_type != VREG) {
2071 2071                  error = ENODEV;
2072 2072                  goto end;
2073 2073          }
2074 2074  
2075 2075          /*
2076 2076           * If file is being locked, disallow mapping.
2077 2077           */
2078 2078          if (vn_has_mandatory_locks(vp, VTOI(vp)->i_char)) {
2079 2079                  error = EAGAIN;
2080 2080                  goto end;
2081 2081          }
2082 2082  
2083 2083          as_rangelock(as);
2084 2084          error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
2085 2085          if (error != 0) {
2086 2086                  as_rangeunlock(as);
2087 2087                  goto end;
2088 2088          }
2089 2089  
2090 2090          vn_a.vp = vp;
2091 2091          vn_a.offset = off;
2092 2092          vn_a.type = flags & MAP_TYPE;
2093 2093          vn_a.prot = prot;
2094 2094          vn_a.maxprot = maxprot;
2095 2095          vn_a.cred = cr;
2096 2096          vn_a.amp = NULL;
2097 2097          vn_a.flags = flags & ~MAP_TYPE;
2098 2098          vn_a.szc = 0;
2099 2099          vn_a.lgrp_mem_policy_flags = 0;
2100 2100  
2101 2101          error = as_map(as, *addrp, len, segvn_create, (caddr_t)&vn_a);
2102 2102          as_rangeunlock(as);
2103 2103  
2104 2104  end:
2105 2105          return (error);
2106 2106  }
2107 2107  
2108 2108  /* ARGSUSED */
2109 2109  static int32_t
2110 2110  udf_addmap(struct vnode *vp,
2111 2111          offset_t off,
2112 2112          struct as *as,
2113 2113          caddr_t addr,
2114 2114          size_t len,
2115 2115          uint8_t prot,
2116 2116          uint8_t maxprot,
2117 2117          uint32_t flags,
2118 2118          struct cred *cr,
2119 2119          caller_context_t *ct)
2120 2120  {
2121 2121          struct ud_inode *ip = VTOI(vp);
2122 2122  
2123 2123          ud_printf("udf_addmap\n");
2124 2124  
2125 2125          if (vp->v_flag & VNOMAP) {
2126 2126                  return (ENOSYS);
2127 2127          }
2128 2128  
2129 2129          mutex_enter(&ip->i_tlock);
2130 2130          ip->i_mapcnt += btopr(len);
2131 2131          mutex_exit(&ip->i_tlock);
2132 2132  
2133 2133          return (0);
2134 2134  }
2135 2135  
2136 2136  /* ARGSUSED */
2137 2137  static int32_t
2138 2138  udf_delmap(
2139 2139          struct vnode *vp, offset_t off,
2140 2140          struct as *as,
2141 2141          caddr_t addr,
2142 2142          size_t len,
2143 2143          uint32_t prot,
2144 2144          uint32_t maxprot,
2145 2145          uint32_t flags,
2146 2146          struct cred *cr,
2147 2147          caller_context_t *ct)
2148 2148  {
2149 2149          struct ud_inode *ip = VTOI(vp);
2150 2150  
2151 2151          ud_printf("udf_delmap\n");
2152 2152  
2153 2153          if (vp->v_flag & VNOMAP) {
2154 2154                  return (ENOSYS);
2155 2155          }
2156 2156  
2157 2157          mutex_enter(&ip->i_tlock);
2158 2158          ip->i_mapcnt -= btopr(len);     /* Count released mappings */
2159 2159          ASSERT(ip->i_mapcnt >= 0);
2160 2160          mutex_exit(&ip->i_tlock);
2161 2161  
2162 2162          return (0);
2163 2163  }
2164 2164  
2165 2165  /* ARGSUSED */
2166 2166  static int32_t
2167 2167  udf_l_pathconf(
2168 2168          struct vnode *vp,
2169 2169          int32_t cmd,
2170 2170          ulong_t *valp,
2171 2171          struct cred *cr,
2172 2172          caller_context_t *ct)
2173 2173  {
2174 2174          int32_t error = 0;
2175 2175  
2176 2176          ud_printf("udf_l_pathconf\n");
2177 2177  
2178 2178          if (cmd == _PC_FILESIZEBITS) {
2179 2179                  /*
2180 2180                   * udf supports a 64-bit file size, but there
2181 2181                   * are other restrictions: it only supports
2182 2182                   * 32-bit block numbers, and daddr32_t is only
2183 2183                   * an int32_t, so taking these into account we
2184 2184                   * can stay just where ufs is
2185 2185                   */
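                           /*
                            * Roughly: 2^31 positive daddr32_t values times
                            * 512-byte (2^9) disk blocks caps a file at
                            * 2^40 bytes, so 41 bits cover any file size.
                            */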
2186 2186                  *valp = 41;
2187 2187          } else if (cmd == _PC_TIMESTAMP_RESOLUTION) {
2188 2188                  /* nanosecond timestamp resolution */
2189 2189                  *valp = 1L;
2190 2190          } else {
2191 2191                  error = fs_pathconf(vp, cmd, valp, cr, ct);
2192 2192          }
2193 2193  
2194 2194          return (error);
2195 2195  }
2196 2196  
2197 2197  uint32_t ud_pageio_reads = 0, ud_pageio_writes = 0;
2198 2198  #ifndef __lint
2199 2199  _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_reads))
2200 2200  _NOTE(SCHEME_PROTECTS_DATA("safe sharing", ud_pageio_writes))
2201 2201  #endif
2202 2202  /*
2203 2203   * The assumption is that there will not be a pageio request
2204 2204   * to an embedded file
2205 2205   */
2206 2206  /* ARGSUSED */
2207 2207  static int32_t
2208 2208  udf_pageio(
2209 2209          struct vnode *vp,
2210 2210          struct page *pp,
2211 2211          u_offset_t io_off,
2212 2212          size_t io_len,
2213 2213          int32_t flags,
2214 2214          struct cred *cr,
2215 2215          caller_context_t *ct)
2216 2216  {
2217 2217          daddr_t bn;
2218 2218          struct buf *bp;
2219 2219          struct ud_inode *ip = VTOI(vp);
2220 2220          int32_t dolock, error = 0, contig, multi_io;
2221 2221          size_t done_len = 0, cur_len = 0;
2222 2222          page_t *npp = NULL, *opp = NULL, *cpp = pp;
2223 2223  
2224 2224          if (pp == NULL) {
2225 2225                  return (EINVAL);
2226 2226          }
2227 2227  
2228 2228          dolock = (rw_owner(&ip->i_contents) != curthread);
2229 2229  
2230 2230          /*
2231 2231           * We need a better check.  Ideally, we would use another
2232 2232           * vnodeops so that hlocked and forcibly unmounted file
2233 2233           * systems would return EIO where appropriate and w/o the
2234 2234           * need for these checks.
2235 2235           */
2236 2236          if (ip->i_udf == NULL) {
2237 2237                  return (EIO);
2238 2238          }
2239 2239  
2240 2240  #ifdef  __lock_lint
2241 2241          rw_enter(&ip->i_contents, RW_READER);
2242 2242  #else
2243 2243          if (dolock) {
2244 2244                  rw_enter(&ip->i_contents, RW_READER);
2245 2245          }
2246 2246  #endif
2247 2247  
2248 2248          /*
2249 2249           * Break the io request into chunks, one for each contiguous
2250 2250           * stretch of disk blocks in the target file.
2251 2251           */
2252 2252          while (done_len < io_len) {
2253 2253                  ASSERT(cpp);
2254 2254                  bp = NULL;
2255 2255                  contig = 0;
2256 2256                  if (error = ud_bmap_read(ip, (u_offset_t)(io_off + done_len),
2257 2257                      &bn, &contig)) {
2258 2258                          break;
2259 2259                  }
2260 2260  
2261 2261                  if (bn == UDF_HOLE) {   /* No holey swapfiles */
2262 2262                          cmn_err(CE_WARN, "SWAP file has HOLES");
2263 2263                          error = EINVAL;
2264 2264                          break;
2265 2265                  }
2266 2266  
2267 2267                  cur_len = MIN(io_len - done_len, contig);
2268 2268  
2269 2269                  /*
2270 2270                   * Check whether more than one I/O is required to
2271 2271                   * complete this chunk: with lbsize < PAGESIZE a
2272 2272                   * page's blocks may not be contiguous on disk
2273 2273                   */
2274 2274                  if (ip->i_udf->udf_lbsize < PAGESIZE) {
2275 2275                          if (cur_len >= PAGESIZE) {
2276 2276                                  multi_io = 0;
2277 2277                                  cur_len &= PAGEMASK;
2278 2278                          } else {
2279 2279                                  multi_io = 1;
2280 2280                                  cur_len = MIN(io_len - done_len, PAGESIZE);
2281 2281                          }
2282 2282                  }
2283 2283                  page_list_break(&cpp, &npp, btop(cur_len));
2284 2284  
2285 2285                  bp = pageio_setup(cpp, cur_len, ip->i_devvp, flags);
2286 2286                  ASSERT(bp != NULL);
2287 2287  
2288 2288                  bp->b_edev = ip->i_dev;
2289 2289                  bp->b_dev = cmpdev(ip->i_dev);
2290 2290                  bp->b_blkno = bn;
2291 2291                  bp->b_un.b_addr = (caddr_t)0;
2292 2292                  bp->b_file = vp;
2293 2293                  bp->b_offset = (offset_t)(io_off + done_len);
2294 2294  
2295 2295  /*
2296 2296   *              ub.ub_pageios.value.ul++;
2297 2297   */
2298 2298                  if (multi_io == 0) {
2299 2299                          (void) bdev_strategy(bp);
2300 2300                  } else {
2301 2301                          error = ud_multi_strat(ip, cpp, bp,
2302 2302                              (u_offset_t)(io_off + done_len));
2303 2303                          if (error != 0) {
2304 2304                                  pageio_done(bp);
2305 2305                                  break;
2306 2306                          }
2307 2307                  }
2308 2308                  if (flags & B_READ) {
2309 2309                          ud_pageio_reads++;
2310 2310                  } else {
2311 2311                          ud_pageio_writes++;
2312 2312                  }
2313 2313  
2314 2314                  /*
2315 2315                   * If the request is not B_ASYNC, wait for i/o to complete
2316 2316                   * and re-assemble the page list to return to the caller.
2317 2317                   * If it is B_ASYNC we leave the page list in pieces and
2318 2318                   * cleanup() will dispose of them.
2319 2319                   */
2320 2320                  if ((flags & B_ASYNC) == 0) {
2321 2321                          error = biowait(bp);
2322 2322                          pageio_done(bp);
2323 2323                          if (error) {
2324 2324                                  break;
2325 2325                          }
2326 2326                          page_list_concat(&opp, &cpp);
2327 2327                  }
2328 2328                  cpp = npp;
2329 2329                  npp = NULL;
2330 2330                  done_len += cur_len;
2331 2331          }
2332 2332  
2333 2333          ASSERT(error || (cpp == NULL && npp == NULL && done_len == io_len));
2334 2334          if (error) {
2335 2335                  if (flags & B_ASYNC) {
2336 2336                          /* Cleanup unprocessed parts of list */
2337 2337                          page_list_concat(&cpp, &npp);
2338 2338                          if (flags & B_READ) {
2339 2339                                  pvn_read_done(cpp, B_ERROR);
2340 2340                          } else {
2341 2341                                  pvn_write_done(cpp, B_ERROR);
2342 2342                          }
2343 2343                  } else {
2344 2344                          /* Re-assemble list and let caller clean up */
2345 2345                          page_list_concat(&opp, &cpp);
2346 2346                          page_list_concat(&opp, &npp);
2347 2347                  }
2348 2348          }
2349 2349  
2350 2350  #ifdef  __lock_lint
2351 2351          rw_exit(&ip->i_contents);
2352 2352  #else
2353 2353          if (dolock) {
2354 2354                  rw_exit(&ip->i_contents);
2355 2355          }
2356 2356  #endif
2357 2357          return (error);
2358 2358  }
2359 2359  
2360 2360  
2361 2361  
2362 2362  
2363 2363  /* -------------------- local functions --------------------------- */
2364 2364  
2365 2365  
2366 2366  
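           /*
            * In-kernel read/write helper: wraps a single iovec in a uio
            * and calls ud_rdip()/ud_wrip().  If the caller does not ask
            * for the residual count, a short transfer is reported as EIO.
            */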
2367 2367  int32_t
2368 2368  ud_rdwri(enum uio_rw rw, int32_t ioflag,
2369 2369          struct ud_inode *ip, caddr_t base, int32_t len,
2370 2370          offset_t offset, enum uio_seg seg, int32_t *aresid, struct cred *cr)
2371 2371  {
2372 2372          int32_t error;
2373 2373          struct uio auio;
2374 2374          struct iovec aiov;
2375 2375  
2376 2376          ud_printf("ud_rdwri\n");
2377 2377  
2378 2378          bzero((caddr_t)&auio, sizeof (uio_t));
2379 2379          bzero((caddr_t)&aiov, sizeof (iovec_t));
2380 2380  
2381 2381          aiov.iov_base = base;
2382 2382          aiov.iov_len = len;
2383 2383          auio.uio_iov = &aiov;
2384 2384          auio.uio_iovcnt = 1;
2385 2385          auio.uio_loffset = offset;
2386 2386          auio.uio_segflg = (int16_t)seg;
2387 2387          auio.uio_resid = len;
2388 2388  
2389 2389          if (rw == UIO_WRITE) {
2390 2390                  auio.uio_fmode = FWRITE;
2391 2391                  auio.uio_extflg = UIO_COPY_DEFAULT;
2392 2392                  auio.uio_llimit = curproc->p_fsz_ctl;
2393 2393                  error = ud_wrip(ip, &auio, ioflag, cr);
2394 2394          } else {
2395 2395                  auio.uio_fmode = FREAD;
2396 2396                  auio.uio_extflg = UIO_COPY_CACHED;
2397 2397                  auio.uio_llimit = MAXOFFSET_T;
2398 2398                  error = ud_rdip(ip, &auio, ioflag, cr);
2399 2399          }
2400 2400  
2401 2401          if (aresid) {
2402 2402                  *aresid = auio.uio_resid;
2403 2403          } else if (auio.uio_resid) {
2404 2404                  error = EIO;
2405 2405          }
2406 2406          return (error);
2407 2407  }
2408 2408  
2409 2409  /*
2410 2410   * Free behind hacks.  The pager is busted.
2411 2411   * XXX - need to pass the information down to writedone() in a flag like B_SEQ
2412 2412   * or B_FREE_IF_TIGHT_ON_MEMORY.
2413 2413   */
2414 2414  int32_t ud_freebehind = 1;
2415 2415  int32_t ud_smallfile = 32 * 1024;
2416 2416  
2417 2417  /* ARGSUSED */
2418 2418  int32_t
2419 2419  ud_getpage_miss(struct vnode *vp, u_offset_t off,
2420 2420          size_t len, struct seg *seg, caddr_t addr, page_t *pl[],
2421 2421          size_t plsz, enum seg_rw rw, int32_t seq)
2422 2422  {
2423 2423          struct ud_inode *ip = VTOI(vp);
2424 2424          int32_t err = 0;
2425 2425          size_t io_len;
2426 2426          u_offset_t io_off;
2427 2427          u_offset_t pgoff;
2428 2428          page_t *pp;
2429 2429  
2430 2430          pl[0] = NULL;
2431 2431  
2432 2432          /*
2433 2433           * Figure out whether the page can be created, or must be
2434 2434           * read from the disk
2435 2435           */
2436 2436          if (rw == S_CREATE) {
2437 2437                  if ((pp = page_create_va(vp, off,
2438 2438                      PAGESIZE, PG_WAIT, seg, addr)) == NULL) {
2439 2439                          cmn_err(CE_WARN, "ud_getpage_miss: page_create");
2440 2440                          return (EINVAL);
2441 2441                  }
2442 2442                  io_len = PAGESIZE;
2443 2443          } else {
2444 2444                  pp = pvn_read_kluster(vp, off, seg, addr, &io_off,
2445 2445                      &io_len, off, PAGESIZE, 0);
2446 2446  
2447 2447                  /*
2448 2448                   * Some other thread has entered the page.
2449 2449                   * ud_getpage will retry page_lookup.
2450 2450                   */
2451 2451                  if (pp == NULL) {
2452 2452                          return (0);
2453 2453                  }
2454 2454  
2455 2455                  /*
2456 2456                   * Fill the page with as much data as we can from the file.
2457 2457                   */
2458 2458                  err = ud_page_fill(ip, pp, off, B_READ, &pgoff);
2459 2459                  if (err) {
2460 2460                          pvn_read_done(pp, B_ERROR);
2461 2461                          return (err);
2462 2462                  }
2463 2463  
2464 2464                  /*
2465 2465                   * XXX ??? ufs has io_len instead of pgoff below
2466 2466                   */
2467 2467                  ip->i_nextrio = off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2468 2468  
2469 2469                  /*
2470 2470                   * If the file access is sequential, initiate read ahead
2471 2471                   * of the next cluster.
2472 2472                   */
2473 2473                  if (seq && ip->i_nextrio < ip->i_size) {
2474 2474                          ud_getpage_ra(vp, off, seg, addr);
2475 2475                  }
2476 2476          }
2477 2477  
2478 2478  outmiss:
2479 2479          pvn_plist_init(pp, pl, plsz, (offset_t)off, io_len, rw);
2480 2480          return (err);
2481 2481  }
2482 2482  
2483 2483  /* ARGSUSED */
2484 2484  void
2485 2485  ud_getpage_ra(struct vnode *vp,
2486 2486          u_offset_t off, struct seg *seg, caddr_t addr)
2487 2487  {
2488 2488          page_t *pp;
2489 2489          size_t io_len;
2490 2490          struct ud_inode *ip = VTOI(vp);
2491 2491          u_offset_t io_off = ip->i_nextrio, pgoff;
2492 2492          caddr_t addr2 = addr + (io_off - off);
2493 2493          daddr_t bn;
2494 2494          int32_t contig = 0;
2495 2495  
2496 2496          /*
2497 2497           * Is this test needed?
2498 2498           */
2499 2499  
2500 2500          if (addr2 >= seg->s_base + seg->s_size) {
2501 2501                  return;
2502 2502          }
2503 2503  
2504 2504          contig = 0;
2505 2505          if (ud_bmap_read(ip, io_off, &bn, &contig) != 0 || bn == UDF_HOLE) {
2506 2506                  return;
2507 2507          }
2508 2508  
2509 2509          pp = pvn_read_kluster(vp, io_off, seg, addr2,
2510 2510              &io_off, &io_len, io_off, PAGESIZE, 1);
2511 2511  
2512 2512          /*
2513 2513           * Some other thread has entered the page.
2514 2514           * So no read ahead is done here (i.e. we will have to
2515 2515           * wait for the read when needed).
2516 2516           */
2517 2517  
2518 2518          if (pp == NULL) {
2519 2519                  return;
2520 2520          }
2521 2521  
2522 2522          (void) ud_page_fill(ip, pp, io_off, (B_READ|B_ASYNC), &pgoff);
2523 2523          ip->i_nextrio =  io_off + ((pgoff + PAGESIZE - 1) & PAGEMASK);
2524 2524  }
2525 2525  
2526 2526  int
2527 2527  ud_page_fill(struct ud_inode *ip, page_t *pp, u_offset_t off,
2528 2528          uint32_t bflgs, u_offset_t *pg_off)
2529 2529  {
2530 2530          daddr_t bn;
2531 2531          struct buf *bp;
2532 2532          caddr_t kaddr, caddr;
2533 2533          int32_t error = 0, contig = 0, multi_io = 0;
2534 2534          int32_t lbsize = ip->i_udf->udf_lbsize;
2535 2535          int32_t lbmask = ip->i_udf->udf_lbmask;
2536 2536          uint64_t isize;
2537 2537  
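                   /* Round the file size up to a logical-block boundary. */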
2538 2538          isize = (ip->i_size + lbmask) & (~lbmask);
2539 2539          if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2540 2540  
2541 2541                  /*
2542 2542                   * Embedded file: read the file_entry
2543 2543                   * from the buffer cache and copy out
2544 2544                   * the required portion
2545 2545                   */
2546 2546                  bp = ud_bread(ip->i_dev,
2547 2547                      ip->i_icb_lbano << ip->i_udf->udf_l2d_shift, lbsize);
2548 2548                  if ((bp->b_error == 0) &&
2549 2549                      (bp->b_resid == 0)) {
2550 2550  
2551 2551                          caddr = bp->b_un.b_addr + ip->i_data_off;
2552 2552  
2553 2553                          /*
2554 2554                           * mapin to kvm
2555 2555                           */
2556 2556                          kaddr = (caddr_t)ppmapin(pp,
2557 2557                              PROT_READ | PROT_WRITE, (caddr_t)-1);
2558 2558                          (void) kcopy(caddr, kaddr, ip->i_size);
2559 2559  
2560 2560                          /*
2561 2561                           * mapout of kvm
2562 2562                           */
2563 2563                          ppmapout(kaddr);
2564 2564                  }
2565 2565                  brelse(bp);
2566 2566                  contig = ip->i_size;
2567 2567          } else {
2568 2568  
2569 2569                  /*
2570 2570                   * Get the contiguous size and block number
2571 2571                   * at offset "off"
2572 2572                   */
2573 2573                  if (error = ud_bmap_read(ip, off, &bn, &contig))
2574 2574                          goto out;
2575 2575                  contig = MIN(contig, PAGESIZE);
2576 2576                  contig = (contig + lbmask) & (~lbmask);
2577 2577  
2578 2578                  /*
2579 2579                   * Zero part of the page which we are not
2580 2580                   * going to read from the disk.
2581 2581                   */
2582 2582  
2583 2583                  if (bn == UDF_HOLE) {
2584 2584  
2585 2585                          /*
2586 2586                           * This is a HOLE. Just zero out
2587 2587                           * the page
2588 2588                           */
2589 2589                          if (((off + contig) == isize) ||
2590 2590                              (contig == PAGESIZE)) {
2591 2591                                  pagezero(pp->p_prev, 0, PAGESIZE);
2592 2592                                  goto out;
2593 2593                          }
2594 2594                  }
2595 2595  
2596 2596                  if (contig < PAGESIZE) {
2597 2597                          uint64_t count;
2598 2598  
2599 2599                          count = isize - off;
2600 2600                          if (contig != count) {
2601 2601                                  multi_io = 1;
2602 2602                                  contig = (int32_t)(MIN(count, PAGESIZE));
2603 2603                          } else {
2604 2604                                  pagezero(pp->p_prev, contig, PAGESIZE - contig);
2605 2605                          }
2606 2606                  }
2607 2607  
2608 2608                  /*
2609 2609                   * Get a bp and initialize it
2610 2610                   */
2611 2611                  bp = pageio_setup(pp, contig, ip->i_devvp, bflgs);
2612 2612                  ASSERT(bp != NULL);
2613 2613  
2614 2614                  bp->b_edev = ip->i_dev;
2615 2615                  bp->b_dev = cmpdev(ip->i_dev);
2616 2616                  bp->b_blkno = bn;
2617 2617                  bp->b_un.b_addr = 0;
2618 2618                  bp->b_file = ip->i_vnode;
2619 2619  
2620 2620                  /*
2621 2621                   * Start I/O
2622 2622                   */
2623 2623                  if (multi_io == 0) {
2624 2624  
2625 2625                          /*
2626 2626                           * Single I/O is sufficient for this page
2627 2627                           */
2628 2628                          (void) bdev_strategy(bp);
2629 2629                  } else {
2630 2630  
2631 2631                          /*
2632 2632                           * We need to do the I/O in
2633 2633                           * pieces
2634 2634                           */
2635 2635                          error = ud_multi_strat(ip, pp, bp, off);
2636 2636                          if (error != 0) {
2637 2637                                  goto out;
2638 2638                          }
2639 2639                  }
2640 2640                  if ((bflgs & B_ASYNC) == 0) {
2641 2641  
2642 2642                          /*
2643 2643                           * Wait for i/o to complete.
2644 2644                           */
2645 2645  
2646 2646                          error = biowait(bp);
2647 2647                          pageio_done(bp);
2648 2648                          if (error) {
2649 2649                                  goto out;
2650 2650                          }
2651 2651                  }
2652 2652          }
2653 2653          if ((off + contig) >= ip->i_size) {
2654 2654                  contig = ip->i_size - off;
2655 2655          }
2656 2656  
2657 2657  out:
2658 2658          *pg_off = contig;
2659 2659          return (error);
2660 2660  }
2661 2661  
2662 2662  int32_t
2663 2663  ud_putpages(struct vnode *vp, offset_t off,
2664 2664          size_t len, int32_t flags, struct cred *cr)
2665 2665  {
2666 2666          struct ud_inode *ip;
2667 2667          page_t *pp;
2668 2668          u_offset_t io_off;
2669 2669          size_t io_len;
2670 2670          u_offset_t eoff;
2671 2671          int32_t err = 0;
2672 2672          int32_t dolock;
2673 2673  
2674 2674          ud_printf("ud_putpages\n");
2675 2675  
2676 2676          if (vp->v_count == 0) {
2677 2677                  cmn_err(CE_WARN, "ud_putpages: bad v_count");
2678 2678                  return (EINVAL);
2679 2679          }
2680 2680  
2681 2681          ip = VTOI(vp);
2682 2682  
2683 2683          /*
2684 2684           * Acquire the reader/writer inode lock before locking
2685 2685           * any pages in this inode.
2686 2686           * The inode lock is held during i/o.
2687 2687           */
2688 2688          if (len == 0) {
2689 2689                  mutex_enter(&ip->i_tlock);
2690 2690                  ip->i_delayoff = ip->i_delaylen = 0;
2691 2691                  mutex_exit(&ip->i_tlock);
2692 2692          }
2693 2693  #ifdef  __lock_lint
2694 2694          rw_enter(&ip->i_contents, RW_READER);
2695 2695  #else
2696 2696          dolock = (rw_owner(&ip->i_contents) != curthread);
2697 2697          if (dolock) {
2698 2698                  rw_enter(&ip->i_contents, RW_READER);
2699 2699          }
2700 2700  #endif
2701 2701  
2702 2702          if (!vn_has_cached_data(vp)) {
2703 2703  #ifdef  __lock_lint
2704 2704                  rw_exit(&ip->i_contents);
2705 2705  #else
2706 2706                  if (dolock) {
2707 2707                          rw_exit(&ip->i_contents);
2708 2708                  }
2709 2709  #endif
2710 2710                  return (0);
2711 2711          }
2712 2712  
2713 2713          if (len == 0) {
2714 2714                  /*
2715 2715                   * Search the entire vp list for pages >= off.
2716 2716                   */
2717 2717                  err = pvn_vplist_dirty(vp, (u_offset_t)off, ud_putapage,
2718 2718                      flags, cr);
2719 2719          } else {
2720 2720                  /*
2721 2721                   * Loop over all offsets in the range looking for
2722 2722                   * pages to deal with.
2723 2723                   */
2724 2724                  if ((eoff = blkroundup(ip->i_udf, ip->i_size)) != 0) {
2725 2725                          eoff = MIN(off + len, eoff);
2726 2726                  } else {
2727 2727                          eoff = off + len;
2728 2728                  }
2729 2729  
2730 2730                  for (io_off = off; io_off < eoff; io_off += io_len) {
2731 2731                          /*
2732 2732                           * If we are not invalidating, synchronously
2733 2733                           * freeing or writing pages, use the routine
2734 2734                           * page_lookup_nowait() to prevent reclaiming
2735 2735                           * them from the free list.
2736 2736                           */
2737 2737                          if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
2738 2738                                  pp = page_lookup(vp, io_off,
2739 2739                                      (flags & (B_INVAL | B_FREE)) ?
2740 2740                                      SE_EXCL : SE_SHARED);
2741 2741                          } else {
2742 2742                                  pp = page_lookup_nowait(vp, io_off,
2743 2743                                      (flags & B_FREE) ? SE_EXCL : SE_SHARED);
2744 2744                          }
2745 2745  
2746 2746                          if (pp == NULL || pvn_getdirty(pp, flags) == 0) {
2747 2747                                  io_len = PAGESIZE;
2748 2748                          } else {
2749 2749  
2750 2750                                  err = ud_putapage(vp, pp,
2751 2751                                      &io_off, &io_len, flags, cr);
2752 2752                                  if (err != 0) {
2753 2753                                          break;
2754 2754                                  }
2755 2755                                  /*
2756 2756                                   * "io_off" and "io_len" are returned as
2757 2757                                   * the range of pages we actually wrote.
2758 2758                                   * This allows us to skip ahead more quickly
2759 2759                                   * since several pages may have been dealt
2760 2760                                   * with by this iteration of the loop.
2761 2761                                   */
2762 2762                          }
2763 2763                  }
2764 2764          }
2765 2765          if (err == 0 && off == 0 && (len == 0 || len >= ip->i_size)) {
2766 2766                  /*
2767 2767                   * We have just sync'ed back all the pages on
2768 2768                   * the inode, turn off the IMODTIME flag.
2769 2769                   */
2770 2770                  mutex_enter(&ip->i_tlock);
2771 2771                  ip->i_flag &= ~IMODTIME;
2772 2772                  mutex_exit(&ip->i_tlock);
2773 2773          }
2774 2774  #ifdef  __lock_lint
2775 2775          rw_exit(&ip->i_contents);
2776 2776  #else
2777 2777          if (dolock) {
2778 2778                  rw_exit(&ip->i_contents);
2779 2779          }
2780 2780  #endif
2781 2781          return (err);
2782 2782  }
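
/*
 * Editor's aside: a sketch of the walk ud_putpages() performs above.
 * The loop advances by the number of bytes each call reports it
 * actually handled (io_len), so one klustered write may cover several
 * pages and the loop skips past all of them.  flush_range() and the
 * two-page kluster size are hypothetical.
 */
#include <stdio.h>
#include <stdint.h>

#define	PG	4096

/* Pretend each call klusters up to two pages starting at 'off'. */
static uint64_t
flush_range(uint64_t off, uint64_t eoff)
{
	uint64_t len = 2 * PG;

	if (off + len > eoff)
		len = eoff - off;
	printf("wrote [%llu, %llu)\n", (unsigned long long)off,
	    (unsigned long long)(off + len));
	return (len);		/* plays the role of io_len */
}

int
main(void)
{
	uint64_t off, len;

	for (off = 0; off < 5 * PG; off += len)
		len = flush_range(off, 5 * PG);
	return (0);
}
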
2783 2783  
2784 2784  /* ARGSUSED */
2785 2785  int32_t
2786 2786  ud_putapage(struct vnode *vp,
2787 2787          page_t *pp, u_offset_t *offp,
2788 2788          size_t *lenp, int32_t flags, struct cred *cr)
2789 2789  {
2790 2790          daddr_t bn;
2791 2791          size_t io_len;
2792 2792          struct ud_inode *ip;
2793 2793          int32_t error = 0, contig, multi_io = 0;
2794 2794          struct udf_vfs *udf_vfsp;
2795 2795          u_offset_t off, io_off;
2796 2796          caddr_t kaddr, caddr;
2797 2797          struct buf *bp = NULL;
2798 2798          int32_t lbmask;
2799 2799          uint64_t isize;
2800 2800          uint16_t crc_len;
2801 2801          struct file_entry *fe;
2802 2802  
2803 2803          ud_printf("ud_putapage\n");
2804 2804  
2805 2805          ip = VTOI(vp);
2806 2806          ASSERT(ip);
2807 2807          ASSERT(RW_LOCK_HELD(&ip->i_contents));
2808 2808          lbmask = ip->i_udf->udf_lbmask;
2809 2809          isize = (ip->i_size + lbmask) & (~lbmask);
2810 2810  
2811 2811          udf_vfsp = ip->i_udf;
2812 2812          ASSERT(udf_vfsp->udf_flags & UDF_FL_RW);
2813 2813  
2814 2814          /*
2815 2815           * If the modified time on the inode has not already been
2816 2816           * set elsewhere (e.g. for write/setattr) we set the time now.
2817 2817           * This gives us approximate modified times for mmap'ed files
2818 2818           * which are modified via stores in the user address space.
2819 2819           */
2820 2820          if (((ip->i_flag & IMODTIME) == 0) || (flags & B_FORCE)) {
2821 2821                  mutex_enter(&ip->i_tlock);
2822 2822                  ip->i_flag |= IUPD;
2823 2823                  ITIMES_NOLOCK(ip);
2824 2824                  mutex_exit(&ip->i_tlock);
2825 2825          }
2826 2826  
2827 2827  
2828 2828          /*
2829 2829           * Align the request to a block boundary (for old file systems),
2830 2830           * and go ask bmap() how contiguous things are for this file.
2831 2831           */
2832 2832          off = pp->p_offset & ~(offset_t)lbmask;
2833 2833                                  /* block align it */
2834 2834  
2835 2835  
2836 2836          if (ip->i_desc_type == ICB_FLAG_ONE_AD) {
2837 2837                  ASSERT(ip->i_size <= ip->i_max_emb);
2838 2838  
2839 2839                  pp = pvn_write_kluster(vp, pp, &io_off,
2840 2840                      &io_len, off, PAGESIZE, flags);
2841 2841                  if (io_len == 0) {
2842 2842                          io_len = PAGESIZE;
2843 2843                  }
2844 2844  
2845 2845                  bp = ud_bread(ip->i_dev,
2846 2846                      ip->i_icb_lbano << udf_vfsp->udf_l2d_shift,
2847 2847                      udf_vfsp->udf_lbsize);
2848 2848                  fe = (struct file_entry *)bp->b_un.b_addr;
2849 2849                  if ((bp->b_flags & B_ERROR) ||
2850 2850                      (ud_verify_tag_and_desc(&fe->fe_tag, UD_FILE_ENTRY,
2851 2851                      ip->i_icb_block,
2852 2852                      1, udf_vfsp->udf_lbsize) != 0)) {
2853 2853                          if (pp != NULL)
2854 2854                                  pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2855 2855                          if (bp->b_flags & B_ERROR) {
2856 2856                                  error = EIO;
2857 2857                          } else {
2858 2858                                  error = EINVAL;
2859 2859                          }
2860 2860                          brelse(bp);
2861 2861                          return (error);
2862 2862                  }
2863 2863                  if ((bp->b_error == 0) &&
2864 2864                      (bp->b_resid == 0)) {
2865 2865  
2866 2866                          caddr = bp->b_un.b_addr + ip->i_data_off;
2867 2867                          kaddr = (caddr_t)ppmapin(pp,
2868 2868                              PROT_READ | PROT_WRITE, (caddr_t)-1);
2869 2869                          (void) kcopy(kaddr, caddr, ip->i_size);
2870 2870                          ppmapout(kaddr);
2871 2871                  }
2872 2872                  crc_len = offsetof(struct file_entry, fe_spec) +
2873 2873                      SWAP_32(fe->fe_len_ear);
2874 2874                  crc_len += ip->i_size;
2875 2875                  ud_make_tag(ip->i_udf, &fe->fe_tag,
2876 2876                      UD_FILE_ENTRY, ip->i_icb_block, crc_len);
2877 2877  
2878 2878                  bwrite(bp);
2879 2879  
2880 2880                  if (flags & B_ASYNC) {
2881 2881                          pvn_write_done(pp, flags);
2882 2882                  }
2883 2883                  contig = ip->i_size;
2884 2884          } else {
2885 2885  
2886 2886                  if (error = ud_bmap_read(ip, off, &bn, &contig)) {
2887 2887                          goto out;
2888 2888                  }
2889 2889                  contig = MIN(contig, PAGESIZE);
2890 2890                  contig = (contig + lbmask) & (~lbmask);
2891 2891  
2892 2892                  if (contig < PAGESIZE) {
2893 2893                          uint64_t count;
2894 2894  
2895 2895                          count = isize - off;
2896 2896                          if (contig != count) {
2897 2897                                  multi_io = 1;
2898 2898                                  contig = (int32_t)(MIN(count, PAGESIZE));
2899 2899                          }
2900 2900                  }
2901 2901  
2902 2902                  if ((off + contig) > isize) {
2903 2903                          contig = isize - off;
2904 2904                  }
2905 2905  
2906 2906                  if (contig > PAGESIZE) {
2907 2907                          if (contig & PAGEOFFSET) {
2908 2908                                  contig &= PAGEMASK;
2909 2909                          }
2910 2910                  }
2911 2911  
2912 2912                  pp = pvn_write_kluster(vp, pp, &io_off,
2913 2913                      &io_len, off, contig, flags);
2914 2914                  if (io_len == 0) {
2915 2915                          io_len = PAGESIZE;
2916 2916                  }
2917 2917  
2918 2918                  bp = pageio_setup(pp, contig, ip->i_devvp, B_WRITE | flags);
2919 2919                  ASSERT(bp != NULL);
2920 2920  
2921 2921                  bp->b_edev = ip->i_dev;
2922 2922                  bp->b_dev = cmpdev(ip->i_dev);
2923 2923                  bp->b_blkno = bn;
2924 2924                  bp->b_un.b_addr = 0;
2925 2925                  bp->b_file = vp;
2926 2926                  bp->b_offset = (offset_t)off;
2927 2927  
2928 2928  
2929 2929                  /*
2930 2930                   * write throttle
2931 2931                   */
2932 2932                  ASSERT(bp->b_iodone == NULL);
2933 2933                  bp->b_iodone = ud_iodone;
2934 2934                  mutex_enter(&ip->i_tlock);
2935 2935                  ip->i_writes += bp->b_bcount;
2936 2936                  mutex_exit(&ip->i_tlock);
2937 2937  
2938 2938                  if (multi_io == 0) {
2939 2939  
2940 2940                          (void) bdev_strategy(bp);
2941 2941                  } else {
2942 2942                          error = ud_multi_strat(ip, pp, bp, off);
2943 2943                          if (error != 0) {
2944 2944                                  goto out;
2945 2945                          }
2946 2946                  }
2947 2947  
2948 2948                  if ((flags & B_ASYNC) == 0) {
2949 2949                          /*
2950 2950                           * Wait for I/O to complete.
2951 2951                           */
2952 2952                          error = biowait(bp);
2953 2953                          pageio_done(bp);
2954 2954                  }
2955 2955          }
2956 2956  
2957 2957          if ((flags & B_ASYNC) == 0) {
2958 2958                  pvn_write_done(pp, ((error) ? B_ERROR : 0) | B_WRITE | flags);
2959 2959          }
2960 2960  
2961 2961          pp = NULL;
2962 2962  
2963 2963  out:
2964 2964          if (error != 0 && pp != NULL) {
2965 2965                  pvn_write_done(pp, B_ERROR | B_WRITE | flags);
2966 2966          }
2967 2967  
2968 2968          if (offp) {
2969 2969                  *offp = io_off;
2970 2970          }
2971 2971          if (lenp) {
2972 2972                  *lenp = io_len;
2973 2973          }
2974 2974  
2975 2975          return (error);
2976 2976  }
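
/*
 * Editor's aside: ud_putapage() rounds i_size up to a logical-block
 * boundary ((i_size + lbmask) & ~lbmask) and rounds the page offset
 * down (p_offset & ~lbmask).  A self-contained sketch of that mask
 * arithmetic, assuming a hypothetical 2K block size; it only works
 * because lbsize is a power of two.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t
roundup_blk(uint64_t v, uint64_t lbmask)
{
	return ((v + lbmask) & ~lbmask);
}

static uint64_t
rounddown_blk(uint64_t v, uint64_t lbmask)
{
	return (v & ~lbmask);
}

int
main(void)
{
	uint64_t lbmask = 2048 - 1;	/* lbsize - 1 */

	/* 6000 rounds up to 6144 (isize) and down to 4096 (off). */
	printf("%llu %llu\n",
	    (unsigned long long)roundup_blk(6000, lbmask),
	    (unsigned long long)rounddown_blk(6000, lbmask));
	return (0);
}
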
2977 2977  
2978 2978  
2979 2979  int32_t
2980 2980  ud_iodone(struct buf *bp)
2981 2981  {
2982 2982          struct ud_inode *ip;
2983 2983  
2984 2984          ASSERT((bp->b_pages->p_vnode != NULL) && !(bp->b_flags & B_READ));
2985 2985  
2986 2986          bp->b_iodone = NULL;
2987 2987  
2988 2988          ip = VTOI(bp->b_pages->p_vnode);
2989 2989  
2990 2990          mutex_enter(&ip->i_tlock);
2991 2991          if (ip->i_writes >= ud_LW) {
2992 2992                  if ((ip->i_writes -= bp->b_bcount) <= ud_LW) {
2993 2993                          if (ud_WRITES) {
2994 2994                                  cv_broadcast(&ip->i_wrcv); /* wake all up */
2995 2995                          }
2996 2996                  }
2997 2997          } else {
2998 2998                  ip->i_writes -= bp->b_bcount;
2999 2999          }
3000 3000          mutex_exit(&ip->i_tlock);
3001 3001          iodone(bp);
3002 3002          return (0);
3003 3003  }
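
/*
 * Editor's aside: ud_iodone() is the completion half of a classic
 * high/low-watermark write throttle -- i_writes counts bytes of write
 * I/O in flight, and waiters are woken only once it drains below the
 * low watermark.  A user-space sketch of the whole scheme, assuming
 * hypothetical LW/HW values and pthread stand-ins for i_tlock/i_wrcv.
 */
#include <pthread.h>
#include <stdint.h>

static pthread_mutex_t tlock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t wrcv = PTHREAD_COND_INITIALIZER;
static uint64_t pending;		/* bytes of write I/O in flight */
enum { LW = 64 * 1024, HW = 256 * 1024 };

/* Issuer side: block while too much write I/O is outstanding. */
static void
throttle_issue(uint64_t bytes)
{
	pthread_mutex_lock(&tlock);
	while (pending >= HW)
		pthread_cond_wait(&wrcv, &tlock);
	pending += bytes;
	pthread_mutex_unlock(&tlock);
}

/* Completion side (cf. ud_iodone): wake writers below the low mark. */
static void
throttle_done(uint64_t bytes)
{
	pthread_mutex_lock(&tlock);
	pending -= bytes;
	if (pending <= LW)
		pthread_cond_broadcast(&wrcv);
	pthread_mutex_unlock(&tlock);
}

int
main(void)
{
	throttle_issue(4096);
	throttle_done(4096);
	return (0);
}
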
3004 3004  
3005 3005  /* ARGSUSED3 */
3006 3006  int32_t
3007 3007  ud_rdip(struct ud_inode *ip, struct uio *uio, int32_t ioflag, cred_t *cr)
3008 3008  {
3009 3009          struct vnode *vp;
3010 3010          struct udf_vfs *udf_vfsp;
3011 3011          krw_t rwtype;
3012 3012          caddr_t base;
3013 3013          uint32_t flags;
3014 3014          int32_t error, n, on, mapon, dofree;
3015 3015          u_offset_t off;
3016 3016          long oresid = uio->uio_resid;
3017 3017  
3018 3018          ASSERT(RW_LOCK_HELD(&ip->i_contents));
3019 3019          if ((ip->i_type != VREG) &&
3020 3020              (ip->i_type != VDIR) &&
3021 3021              (ip->i_type != VLNK)) {
3022 3022                  return (EIO);
3023 3023          }
3024 3024  
3025 3025          if (uio->uio_loffset > MAXOFFSET_T) {
3026 3026                  return (0);
3027 3027          }
3028 3028  
3029 3029          if ((uio->uio_loffset < (offset_t)0) ||
3030 3030              ((uio->uio_loffset + uio->uio_resid) < 0)) {
3031 3031                  return (EINVAL);
3032 3032          }
3033 3033          if (uio->uio_resid == 0) {
3034 3034                  return (0);
3035 3035          }
3036 3036  
3037 3037          vp = ITOV(ip);
3038 3038          udf_vfsp = ip->i_udf;
3039 3039          mutex_enter(&ip->i_tlock);
3040 3040          ip->i_flag |= IACC;
3041 3041          mutex_exit(&ip->i_tlock);
3042 3042  
3043 3043          rwtype = (rw_write_held(&ip->i_contents) ? RW_WRITER : RW_READER);
3044 3044  
3045 3045          do {
3046 3046                  offset_t diff;
3047 3047                  u_offset_t uoff = uio->uio_loffset;
3048 3048                  off = uoff & (offset_t)MAXBMASK;
3049 3049                  mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3050 3050                  on = (int)blkoff(udf_vfsp, uoff);
3051 3051                  n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3052 3052  
3053 3053                  diff = ip->i_size - uoff;
3054 3054  
3055 3055                  if (diff <= (offset_t)0) {
3056 3056                          error = 0;
3057 3057                          goto out;
3058 3058                  }
3059 3059                  if (diff < (offset_t)n) {
3060 3060                          n = (int)diff;
3061 3061                  }
3062 3062                  dofree = ud_freebehind &&
3063 3063                      ip->i_nextr == (off & PAGEMASK) &&
3064 3064                      off > ud_smallfile;
3065 3065  
3066 3066  #ifndef __lock_lint
3067 3067                  if (rwtype == RW_READER) {
3068 3068                          rw_exit(&ip->i_contents);
3069 3069                  }
3070 3070  #endif
3071 3071  
3072 3072                  base = segmap_getmapflt(segkmap, vp, (off + mapon),
3073 3073                      (uint32_t)n, 1, S_READ);
3074 3074                  error = uiomove(base + mapon, (long)n, UIO_READ, uio);
3075 3075  
3076 3076                  flags = 0;
3077 3077                  if (!error) {
3078 3078                          /*
3079 3079                           * If we read a whole block, or read to EOF,
3080 3080                           * we won't need this buffer again soon.
3081 3081                           */
3082 3082                          if (n + on == MAXBSIZE && ud_freebehind && dofree &&
3083 3083                              freemem < lotsfree + pages_before_pager) {
3084 3084                                  flags = SM_FREE | SM_DONTNEED | SM_ASYNC;
3085 3085                          }
3086 3086                          /*
3087 3087                           * In POSIX SYNC (FSYNC and FDSYNC) read mode,
3088 3088                           * we want to make sure that the page which has
3089 3089                           * been read, is written on disk if it is dirty.
3090 3090                           * And corresponding indirect blocks should also
3091 3091                           * be flushed out.
3092 3092                           */
3093 3093                          if ((ioflag & FRSYNC) && (ioflag & (FSYNC|FDSYNC))) {
3094 3094                                  flags &= ~SM_ASYNC;
3095 3095                                  flags |= SM_WRITE;
3096 3096                          }
3097 3097                          error = segmap_release(segkmap, base, flags);
3098 3098                  } else {
3099 3099                          (void) segmap_release(segkmap, base, flags);
3100 3100                  }
3101 3101  
3102 3102  #ifndef __lock_lint
3103 3103                  if (rwtype == RW_READER) {
3104 3104                          rw_enter(&ip->i_contents, rwtype);
3105 3105                  }
3106 3106  #endif
3107 3107          } while (error == 0 && uio->uio_resid > 0 && n != 0);
3108 3108  out:
3109 3109          /*
3110 3110           * Inode is updated according to this table if FRSYNC is set.
3111 3111           *
3112 3112           *      FSYNC   FDSYNC(posix.4)
3113 3113           *      --------------------------
3114 3114           *      always  IATTCHG|IBDWRITE
3115 3115           */
3116 3116          if (ioflag & FRSYNC) {
3117 3117                  if ((ioflag & FSYNC) ||
3118 3118                      ((ioflag & FDSYNC) &&
3119 3119                      (ip->i_flag & (IATTCHG|IBDWRITE)))) {
3120 3120                          rw_exit(&ip->i_contents);
3121 3121                          rw_enter(&ip->i_contents, RW_WRITER);
3122 3122                          ud_iupdat(ip, 1);
3123 3123                  }
3124 3124          }
3125 3125          /*
3126 3126           * If we've already done a partial read, terminate
3127 3127           * the read but return no error.
3128 3128           */
3129 3129          if (oresid != uio->uio_resid) {
3130 3130                  error = 0;
3131 3131          }
3132 3132          ITIMES(ip);
3133 3133  
3134 3134          return (error);
3135 3135  }
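
/*
 * Editor's aside: the read loop above moves at most one logical block
 * per iteration -- n = MIN(lbsize - on, resid), further clamped at
 * end-of-file by "diff".  A sketch of that chunking arithmetic with
 * hypothetical sizes (2K blocks, a 5000-byte file, a 10000-byte read
 * starting at offset 1000).
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	uint64_t lbsize = 2048, isize = 5000;
	uint64_t off = 1000, resid = 10000;

	while (resid > 0 && off < isize) {
		uint64_t on = off % lbsize;	/* blkoff() analogue */
		uint64_t n = lbsize - on;

		if (n > resid)
			n = resid;
		if (n > isize - off)		/* EOF clamp ("diff") */
			n = isize - off;
		printf("copy %llu bytes at offset %llu\n",
		    (unsigned long long)n, (unsigned long long)off);
		off += n;
		resid -= n;
	}
	return (0);
}
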
3136 3136  
3137 3137  int32_t
3138 3138  ud_wrip(struct ud_inode *ip, struct uio *uio, int ioflag, struct cred *cr)
3139 3139  {
3140 3140          caddr_t base;
3141 3141          struct vnode *vp;
3142 3142          struct udf_vfs *udf_vfsp;
3143 3143          uint32_t flags;
3144 3144          int32_t error = 0, iupdat_flag, n, on, mapon, i_size_changed = 0;
3145 3145          int32_t pagecreate, newpage;
3146 3146          uint64_t old_i_size;
3147 3147          u_offset_t off;
3148 3148          long start_resid = uio->uio_resid, premove_resid;
3149 3149          rlim64_t limit = uio->uio_limit;
3150 3150  
3151 3151  
3152 3152          ASSERT(RW_WRITE_HELD(&ip->i_contents));
3153 3153          if ((ip->i_type != VREG) &&
3154 3154              (ip->i_type != VDIR) &&
3155 3155              (ip->i_type != VLNK)) {
3156 3156                  return (EIO);
3157 3157          }
3158 3158  
3159 3159          if (uio->uio_loffset >= MAXOFFSET_T) {
3160 3160                  return (EFBIG);
3161 3161          }
3162 3162          /*
3163 3163           * see udf_l_pathconf
3164 3164           */
3165 3165          if (limit > (((uint64_t)1 << 40) - 1)) {
3166 3166                  limit = ((uint64_t)1 << 40) - 1;
3167 3167          }
3168 3168          if (uio->uio_loffset >= limit) {
3169 3169                  proc_t *p = ttoproc(curthread);
3170 3170  
3171 3171                  mutex_enter(&p->p_lock);
3172 3172                  (void) rctl_action(rctlproc_legacy[RLIMIT_FSIZE], p->p_rctls,
3173 3173                      p, RCA_UNSAFE_SIGINFO);
3174 3174                  mutex_exit(&p->p_lock);
3175 3175                  return (EFBIG);
3176 3176          }
3177 3177          if ((uio->uio_loffset < (offset_t)0) ||
3178 3178              ((uio->uio_loffset + uio->uio_resid) < 0)) {
3179 3179                  return (EINVAL);
3180 3180          }
3181 3181          if (uio->uio_resid == 0) {
3182 3182                  return (0);
3183 3183          }
3184 3184  
3185 3185          mutex_enter(&ip->i_tlock);
3186 3186          ip->i_flag |= INOACC;
3187 3187  
3188 3188          if (ioflag & (FSYNC | FDSYNC)) {
3189 3189                  ip->i_flag |= ISYNC;
3190 3190                  iupdat_flag = 1;
3191 3191          }
3192 3192          mutex_exit(&ip->i_tlock);
3193 3193  
3194 3194          udf_vfsp = ip->i_udf;
3195 3195          vp = ITOV(ip);
3196 3196  
3197 3197          do {
3198 3198                  u_offset_t uoff = uio->uio_loffset;
3199 3199                  off = uoff & (offset_t)MAXBMASK;
3200 3200                  mapon = (int)(uoff & (offset_t)MAXBOFFSET);
3201 3201                  on = (int)blkoff(udf_vfsp, uoff);
3202 3202                  n = (int)MIN(udf_vfsp->udf_lbsize - on, uio->uio_resid);
3203 3203  
3204 3204                  if (ip->i_type == VREG && uoff + n >= limit) {
3205 3205                          if (uoff >= limit) {
3206 3206                                  error = EFBIG;
3207 3207                                  goto out;
3208 3208                          }
3209 3209                          n = (int)(limit - (rlim64_t)uoff);
3210 3210                  }
3211 3211                  if (uoff + n > ip->i_size) {
3212 3212                          /*
3213 3213                           * We are extending the length of the file.
3214 3214                           * bmap is used so that we are sure that
3215 3215                           * if we need to allocate new blocks, that it
3216 3216                           * is done here before we up the file size.
3217 3217                           */
3218 3218                          error = ud_bmap_write(ip, uoff,
3219 3219                              (int)(on + n), mapon == 0, cr);
3220 3220                          if (error) {
3221 3221                                  break;
3222 3222                          }
3223 3223                          i_size_changed = 1;
3224 3224                          old_i_size = ip->i_size;
3225 3225                          ip->i_size = uoff + n;
3226 3226                          /*
3227 3227                           * If we are writing from the beginning of
3228 3228                           * the mapping, we can just create the
3229 3229                           * pages without having to read them.
3230 3230                           */
3231 3231                          pagecreate = (mapon == 0);
3232 3232                  } else if (n == MAXBSIZE) {
3233 3233                          /*
3234 3234                           * Going to do a whole mapping's worth,
3235 3235                           * so we can just create the pages w/o
3236 3236                           * having to read them in.  But before
3237 3237                           * we do that, we need to make sure any
3238 3238                           * needed blocks are allocated first.
3239 3239                           */
3240 3240                          error = ud_bmap_write(ip, uoff,
3241 3241                              (int)(on + n), 1, cr);
3242 3242                          if (error) {
3243 3243                                  break;
3244 3244                          }
3245 3245                          pagecreate = 1;
3246 3246                  } else {
3247 3247                          pagecreate = 0;
3248 3248                  }
3249 3249  
3250 3250                  rw_exit(&ip->i_contents);
3251 3251  
3252 3252                  /*
3253 3253                   * Touch the page and fault it in if it is not in
3254 3254                   * core before segmap_getmapflt can lock it. This
3255 3255                   * is to avoid the deadlock if the buffer is mapped
3256 3256                   * to the same file through mmap which we want to
3257 3257                   * write to.
3258 3258                   */
3259 3259                  uio_prefaultpages((long)n, uio);
3260 3260  
3261 3261                  base = segmap_getmapflt(segkmap, vp, (off + mapon),
3262 3262                      (uint32_t)n, !pagecreate, S_WRITE);
3263 3263  
3264 3264                  /*
3265 3265                   * segmap_pagecreate() returns 1 if it calls
3266 3266                   * page_create_va() to allocate any pages.
3267 3267                   */
3268 3268                  newpage = 0;
3269 3269                  if (pagecreate) {
3270 3270                          newpage = segmap_pagecreate(segkmap, base,
3271 3271                              (size_t)n, 0);
3272 3272                  }
3273 3273  
3274 3274                  premove_resid = uio->uio_resid;
3275 3275                  error = uiomove(base + mapon, (long)n, UIO_WRITE, uio);
3276 3276  
3277 3277                  if (pagecreate &&
3278 3278                      uio->uio_loffset < roundup(off + mapon + n, PAGESIZE)) {
3279 3279                          /*
3280 3280                           * We created pages w/o initializing them completely,
3281 3281                           * thus we need to zero the part that wasn't set up.
3282 3282                           * This happens on most EOF write cases and if
3283 3283                           * we had some sort of error during the uiomove.
3284 3284                           */
3285 3285                          int nzero, nmoved;
3286 3286  
3287 3287                          nmoved = (int)(uio->uio_loffset - (off + mapon));
3288 3288                          ASSERT(nmoved >= 0 && nmoved <= n);
3289 3289                          nzero = roundup(on + n, PAGESIZE) - nmoved;
3290 3290                          ASSERT(nzero > 0 && mapon + nmoved + nzero <= MAXBSIZE);
3291 3291                          (void) kzero(base + mapon + nmoved, (uint32_t)nzero);
3292 3292                  }
3293 3293  
3294 3294                  /*
3295 3295                   * Unlock the pages allocated by page_create_va()
3296 3296                   * in segmap_pagecreate()
3297 3297                   */
3298 3298                  if (newpage) {
3299 3299                          segmap_pageunlock(segkmap, base, (size_t)n, S_WRITE);
3300 3300                  }
3301 3301  
3302 3302                  if (error) {
3303 3303                          /*
3304 3304                           * If we failed on a write, we may have already
3305 3305                           * allocated file blocks as well as pages.  It's
3306 3306                           * hard to undo the block allocation, but we must
3307 3307                           * be sure to invalidate any pages that may have
3308 3308                           * been allocated.
3309 3309                           */
3310 3310                          (void) segmap_release(segkmap, base, SM_INVAL);
3311 3311                  } else {
3312 3312                          flags = 0;
3313 3313                          /*
3314 3314                           * Force write back for synchronous write cases.
3315 3315                           */
3316 3316                          if ((ioflag & (FSYNC|FDSYNC)) || ip->i_type == VDIR) {
3317 3317                                  /*
3318 3318                                   * If the sticky bit is set but the
3319 3319                                   * execute bit is not set, we do a
3320 3320                                   * synchronous write back and free
3321 3321                                   * the page when done.  We set up swap
3322 3322                                   * files to be handled this way to
3323 3323                                   * prevent servers from keeping around
3324 3324                                   * the client's swap pages too long.
3325 3325                                   * XXX - there ought to be a better way.
3326 3326                                   */
3327 3327                                  if (IS_SWAPVP(vp)) {
3328 3328                                          flags = SM_WRITE | SM_FREE |
3329 3329                                              SM_DONTNEED;
3330 3330                                          iupdat_flag = 0;
3331 3331                                  } else {
3332 3332                                          flags = SM_WRITE;
3333 3333                                  }
3334 3334                          } else if (((mapon + n) == MAXBSIZE) ||
3335 3335                              IS_SWAPVP(vp)) {
3336 3336                                  /*
3337 3337                                   * Have written a whole block.
3338 3338                                   * Start an asynchronous write and
3339 3339                                   * mark the buffer to indicate that
3340 3340                                   * it won't be needed again soon.
3341 3341                                   */
3342 3342                                  flags = SM_WRITE | SM_ASYNC | SM_DONTNEED;
3343 3343                          }
3344 3344                          error = segmap_release(segkmap, base, flags);
3345 3345  
3346 3346                          /*
3347 3347                           * If the operation failed and is synchronous,
3348 3348                           * then we need to unwind what uiomove() last
3349 3349                           * did so we can potentially return an error to
3350 3350                           * the caller.  If this write operation was
3351 3351                           * done in two pieces and the first succeeded,
3352 3352                           * then we won't return an error for the second
3353 3353                           * piece that failed.  However, we only want to
3354 3354                           * return a resid value that reflects what was
3355 3355                           * really done.
3356 3356                           *
3357 3357                           * Failures for non-synchronous operations can
3358 3358                           * be ignored since the page subsystem will
3359 3359                           * retry the operation until it succeeds or the
3360 3360                           * file system is unmounted.
3361 3361                           */
3362 3362                          if (error) {
3363 3363                                  if ((ioflag & (FSYNC | FDSYNC)) ||
3364 3364                                      ip->i_type == VDIR) {
3365 3365                                          uio->uio_resid = premove_resid;
3366 3366                                  } else {
3367 3367                                          error = 0;
3368 3368                                  }
3369 3369                          }
3370 3370                  }
3371 3371  
3372 3372                  /*
3373 3373                   * Re-acquire contents lock.
3374 3374                   */
3375 3375                  rw_enter(&ip->i_contents, RW_WRITER);
3376 3376                  /*
3377 3377                   * If the uiomove() failed or if a synchronous
3378 3378                   * page push failed, fix up i_size.
3379 3379                   */
3380 3380                  if (error) {
3381 3381                          if (i_size_changed) {
3382 3382                                  /*
3383 3383                                   * The uiomove failed, and we
3384 3384                                   * allocated blocks, so get rid
3385 3385                                   * of them.
3386 3386                                   */
3387 3387                                  (void) ud_itrunc(ip, old_i_size, 0, cr);
3388 3388                          }
3389 3389                  } else {
3390 3390                          /*
3391 3391                           * XXX - Can this be out of the loop?
3392 3392                           */
3393 3393                          ip->i_flag |= IUPD | ICHG;
3394 3394                          if (i_size_changed) {
3395 3395                                  ip->i_flag |= IATTCHG;
3396 3396                          }
3397 3397                          if ((ip->i_perm & (IEXEC | (IEXEC >> 5) |
3398 3398                              (IEXEC >> 10))) != 0 &&
3399 3399                              (ip->i_char & (ISUID | ISGID)) != 0 &&
3400 3400                              secpolicy_vnode_setid_retain(cr,
3401 3401                              (ip->i_char & ISUID) != 0 && ip->i_uid == 0) != 0) {
3402 3402                                  /*
3403 3403                                   * Clear Set-UID & Set-GID bits on
3404 3404                                   * successful write if not privileged
3405 3405                                   * and at least one of the execute bits
3406 3406                                   * is set.  If we always clear Set-GID,
3407 3407                                   * mandatory file and record locking is
3408 3408                                   * unusable.
3409 3409                                   */
3410 3410                                  ip->i_char &= ~(ISUID | ISGID);
3411 3411                          }
3412 3412                  }
3413 3413          } while (error == 0 && uio->uio_resid > 0 && n != 0);
3414 3414  
3415 3415  out:
3416 3416          /*
3417 3417           * Inode is updated according to this table -
3418 3418           *
3419 3419           *      FSYNC   FDSYNC(posix.4)
3420 3420           *      --------------------------
3421 3421           *      always@ IATTCHG|IBDWRITE
3422 3422           *
3423 3423           * @ -  If we are doing a synchronous write, the only time we should
3424 3424           *      not be sync'ing the ip here is if we have the stickyhack
3425 3425           *      activated, the file is marked with the sticky bit and
3426 3426           *      no exec bit, the file length has not been changed and
3427 3427           *      no new blocks have been allocated during this write.
3428 3428           */
3429 3429          if ((ip->i_flag & ISYNC) != 0) {
3430 3430                  /*
3431 3431                   * we have eliminated nosync
3432 3432                   */
3433 3433                  if ((ip->i_flag & (IATTCHG|IBDWRITE)) ||
3434 3434                      ((ioflag & FSYNC) && iupdat_flag)) {
3435 3435                          ud_iupdat(ip, 1);
3436 3436                  }
3437 3437          }
3438 3438  
3439 3439          /*
3440 3440           * If we've already done a partial write, terminate
3441 3441           * the write but return no error.
3442 3442           */
3443 3443          if (start_resid != uio->uio_resid) {
3444 3444                  error = 0;
3445 3445          }
3446 3446          ip->i_flag &= ~(INOACC | ISYNC);
3447 3447          ITIMES_NOLOCK(ip);
3448 3448  
3449 3449          return (error);
3450 3450  }
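
/*
 * Editor's aside: when ud_wrip() creates pages without reading them
 * and uiomove() stops short, the tail of the last page holds stale
 * data and must be zeroed (the kzero() call above).  A simplified
 * user-space sketch of that arithmetic; the two-page window and the
 * byte counts are hypothetical, and the driver's version also folds
 * in the block offset "on".
 */
#include <stdio.h>
#include <string.h>

#define	PG	4096

int
main(void)
{
	char window[2 * PG];
	int mapon = 0, n = 6000;	/* intended copy */
	int nmoved = 5000;		/* what the copy actually moved */
	int nzero;

	memset(window, 'X', sizeof (window));	/* "uninitialized" pages */

	/* Zero from the end of the copy out to a page boundary. */
	nzero = ((mapon + n + PG - 1) & ~(PG - 1)) - nmoved;
	memset(window + mapon + nmoved, 0, nzero);
	printf("zeroed %d bytes at %d\n", nzero, mapon + nmoved);
	return (0);
}
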
3451 3451  
3452 3452  int32_t
3453 3453  ud_multi_strat(struct ud_inode *ip,
3454 3454          page_t *pp, struct buf *bp, u_offset_t start)
3455 3455  {
3456 3456          daddr_t bn;
3457 3457          int32_t error = 0, io_count, contig, alloc_sz, i;
3458 3458          uint32_t io_off;
3459 3459          mio_master_t *mm = NULL;
3460 3460          mio_slave_t *ms = NULL;
3461 3461          struct buf *rbp;
3462 3462  
3463 3463          ASSERT(!(start & PAGEOFFSET));
3464 3464  
3465 3465          /*
3466 3466           * Figure out how many buffers to allocate
3467 3467           */
3468 3468          io_count = 0;
3469 3469          for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3470 3470                  contig = 0;
3471 3471                  if (error = ud_bmap_read(ip, (u_offset_t)(start + io_off),
3472 3472                      &bn, &contig)) {
3473 3473                          goto end;
3474 3474                  }
3475 3475                  if (contig == 0) {
3476 3476                          goto end;
3477 3477                  }
3478 3478                  contig = MIN(contig, PAGESIZE - io_off);
3479 3479                  if (bn != UDF_HOLE) {
3480 3480                          io_count++;
3481 3481                  } else {
3482 3482                          /*
3483 3483                           * HOLE
3484 3484                           */
3485 3485                          if (bp->b_flags & B_READ) {
3486 3486  
3487 3487                                  /*
3488 3488                                   * This is a hole being read;
3489 3489                                   * it should be filled with 0's.
3490 3490                                   */
3491 3491                                  pagezero(pp, io_off, contig);
3492 3492                          }
3493 3493                  }
3494 3494          }
3495 3495  
3496 3496  
3497 3497          if (io_count != 0) {
3498 3498  
3499 3499                  /*
3500 3500                   * Allocate memory for the
3501 3501                   * required number of buffers
3502 3502                   */
3503 3503                  alloc_sz = sizeof (mio_master_t) +
3504 3504                      (sizeof (mio_slave_t) * io_count);
3505 3505                  mm = (mio_master_t *)kmem_zalloc(alloc_sz, KM_SLEEP);
3506 3506                  if (mm == NULL) {
3507 3507                          error = ENOMEM;
3508 3508                          goto end;
3509 3509                  }
3510 3510  
3511 3511                  /*
3512 3512                   * initialize master
3513 3513                   */
3514 3514                  mutex_init(&mm->mm_mutex, NULL, MUTEX_DEFAULT, NULL);
3515 3515                  mm->mm_size = alloc_sz;
3516 3516                  mm->mm_bp = bp;
3517 3517                  mm->mm_resid = 0;
3518 3518                  mm->mm_error = 0;
3519 3519                  mm->mm_index = master_index++;
3520 3520  
3521 3521                  ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3522 3522  
3523 3523                  /*
3524 3524                   * Initialize buffers
3525 3525                   */
3526 3526                  io_count = 0;
3527 3527                  for (io_off = 0; io_off < bp->b_bcount; io_off += contig) {
3528 3528                          contig = 0;
3529 3529                          if (error = ud_bmap_read(ip,
3530 3530                              (u_offset_t)(start + io_off),
3531 3531                              &bn, &contig)) {
3532 3532                                  goto end;
3533 3533                          }
3534 3534                          ASSERT(contig);
3535 3535                          if ((io_off + contig) > bp->b_bcount) {
3536 3536                                  contig = bp->b_bcount - io_off;
3537 3537                          }
3538 3538                          if (bn != UDF_HOLE) {
3539 3539                                  /*
3540 3540                                   * Clone the buffer
3541 3541                                   * and prepare to start I/O
3542 3542                                   */
3543 3543                                  ms->ms_ptr = mm;
3544 3544                                  bioinit(&ms->ms_buf);
3545 3545                                  rbp = bioclone(bp, io_off, (size_t)contig,
3546 3546                                      bp->b_edev, bn, ud_slave_done,
3547 3547                                      &ms->ms_buf, KM_NOSLEEP);
3548 3548                                  ASSERT(rbp == &ms->ms_buf);
3549 3549                                  mm->mm_resid += contig;
3550 3550                                  io_count++;
3551 3551                                  ms++;
3552 3552                          }
3553 3553                  }
3554 3554  
3555 3555                  /*
3556 3556                   * Start I/O's
3557 3557                   */
3558 3558                  ms = (mio_slave_t *)(((caddr_t)mm) + sizeof (mio_master_t));
3559 3559                  for (i = 0; i < io_count; i++) {
3560 3560                          (void) bdev_strategy(&ms->ms_buf);
3561 3561                          ms++;
3562 3562                  }
3563 3563          }
3564 3564  
3565 3565  end:
3566 3566          if (error != 0) {
3567 3567                  bp->b_flags |= B_ERROR;
3568 3568                  bp->b_error = error;
3569 3569                  if (mm != NULL) {
3570 3570                          mutex_destroy(&mm->mm_mutex);
3571 3571                          kmem_free(mm, mm->mm_size);
3572 3572                  }
3573 3573          }
3574 3574          return (error);
3575 3575  }
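
/*
 * Editor's aside: ud_multi_strat() uses a two-pass layout -- pass one
 * walks the extents just to count how many need device I/O, so that
 * the master header and every slave buffer fit in one allocation;
 * pass two walks again and initializes a slave per non-hole extent.
 * A sketch of that pattern with hypothetical types and extents.
 */
#include <stdio.h>
#include <stdlib.h>

struct ext { int len; int hole; };	/* one bmap extent */
struct slave { int off; int len; };	/* one cloned I/O */
struct master { int nslaves; int resid; };

int
main(void)
{
	struct ext ext[] = { { 2048, 0 }, { 2048, 1 }, { 4096, 0 } };
	int i, n = 0, off = 0;

	/* Pass 1: count the extents that need device I/O. */
	for (i = 0; i < 3; i++)
		if (!ext[i].hole)
			n++;

	/* One allocation: master header followed by the slave array. */
	struct master *mm = calloc(1,
	    sizeof (*mm) + (size_t)n * sizeof (struct slave));
	struct slave *ms = (struct slave *)(mm + 1);

	/* Pass 2: one slave per non-hole extent; holes need no I/O. */
	for (i = 0; i < 3; i++) {
		if (!ext[i].hole) {
			ms->off = off;
			ms->len = ext[i].len;
			mm->resid += ext[i].len;
			ms++;
		}
		off += ext[i].len;
	}
	mm->nslaves = n;
	printf("%d slaves, %d bytes of I/O\n", mm->nslaves, mm->resid);
	free(mm);
	return (0);
}
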
3576 3576  
3577 3577  int32_t
3578 3578  ud_slave_done(struct buf *bp)
3579 3579  {
3580 3580          mio_master_t *mm;
3581 3581          int32_t resid;
3582 3582  
3583 3583          ASSERT(SEMA_HELD(&bp->b_sem));
3584 3584          ASSERT((bp->b_flags & B_DONE) == 0);
3585 3585  
3586 3586          mm = ((mio_slave_t *)bp)->ms_ptr;
3587 3587  
3588 3588          /*
3589 3589           * Propagate error and byte count info from slave struct to
3590 3590           * the master struct
3591 3591           */
3592 3592          mutex_enter(&mm->mm_mutex);
3593 3593          if (bp->b_flags & B_ERROR) {
3594 3594  
3595 3595                  /*
3596 3596                   * If multiple slave buffers get
3597 3597                   * errors, we keep only the last one;
3598 3598                   * this is OK because we cannot
3599 3599                   * return multiple errors anyway.
3600 3600                   */
3601 3601                  mm->mm_error = bp->b_error;
3602 3602          }
3603 3603          mm->mm_resid -= bp->b_bcount;
3604 3604          resid = mm->mm_resid;
3605 3605          mutex_exit(&mm->mm_mutex);
3606 3606  
3607 3607          /*
3608 3608           * free up the resources allocated to cloned buffers.
3609 3609           */
3610 3610          bp_mapout(bp);
3611 3611          biofini(bp);
3612 3612  
3613 3613          if (resid == 0) {
3614 3614  
3615 3615                  /*
3616 3616                   * This is the last I/O operation;
3617 3617                   * clean up and return the original buffer.
3618 3618                   */
3619 3619                  if (mm->mm_error) {
3620 3620                          mm->mm_bp->b_flags |= B_ERROR;
3621 3621                          mm->mm_bp->b_error = mm->mm_error;
3622 3622                  }
3623 3623                  biodone(mm->mm_bp);
3624 3624                  mutex_destroy(&mm->mm_mutex);
3625 3625                  kmem_free(mm, mm->mm_size);
3626 3626          }
3627 3627          return (0);
3628 3628  }
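
/*
 * Editor's aside: ud_slave_done() is a completion fan-in -- each slave
 * subtracts its byte count from the master's residual under the
 * master's mutex, and whichever completion drives the residual to
 * zero finishes the original request.  A pthread sketch of that
 * pattern; the types and byte counts are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>

struct master {
	pthread_mutex_t mx;	/* stands in for mm_mutex */
	int resid;		/* bytes still outstanding */
	int error;		/* last error wins, as in the driver */
};

/* Returns nonzero if this caller observed the final completion. */
static int
slave_done(struct master *mm, int bytes, int err)
{
	int last;

	pthread_mutex_lock(&mm->mx);
	if (err != 0)
		mm->error = err;	/* later errors overwrite earlier */
	mm->resid -= bytes;
	last = (mm->resid == 0);
	pthread_mutex_unlock(&mm->mx);
	return (last);
}

int
main(void)
{
	struct master mm = { PTHREAD_MUTEX_INITIALIZER, 4096, 0 };

	(void) slave_done(&mm, 2048, 0);
	if (slave_done(&mm, 2048, 0))
		printf("master done, error %d\n", mm.error);
	return (0);
}
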
  