Print this page
NEX-5367 special vdev: sync-write options (NEW)
Reviewed by: Roman Strashkin <roman.strashkin@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4794 Write Back Cache sync and async writes: adjust routing according to watermark limits
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
6328 Fix cstyle errors in zfs codebase (fix studio)
6328 Fix cstyle errors in zfs codebase
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: Richard Elling <Richard.Elling@RichardElling.com>
Reviewed by: Jorgen Lundman <lundman@lundman.net>
Approved by: Robert Mustacchi <rm@joyent.com>
Issues #7: Reconcile L2ARC and "special" use by datasets
re #12616 rb4051 zfs_log_write()/dmu_sync() write once to special refactoring
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
| Split |
Close |
| Expand all |
| Collapse all |
--- old/usr/src/uts/common/fs/zfs/zfs_log.c
+++ new/usr/src/uts/common/fs/zfs/zfs_log.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
|
↓ open down ↓ |
12 lines elided |
↑ open up ↑ |
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright 2013 Nexenta Systems, Inc. All rights reserved.
23 24 * Copyright (c) 2015 by Delphix. All rights reserved.
24 25 * Copyright (c) 2014 Integros [integros.com]
25 26 */
26 27
27 28 #include <sys/types.h>
28 29 #include <sys/param.h>
29 30 #include <sys/systm.h>
30 31 #include <sys/sysmacros.h>
31 32 #include <sys/cmn_err.h>
32 33 #include <sys/kmem.h>
33 34 #include <sys/thread.h>
34 35 #include <sys/file.h>
35 36 #include <sys/vfs.h>
36 37 #include <sys/zfs_znode.h>
|
↓ open down ↓ |
4 lines elided |
↑ open up ↑ |
37 38 #include <sys/zfs_dir.h>
38 39 #include <sys/zil.h>
39 40 #include <sys/zil_impl.h>
40 41 #include <sys/byteorder.h>
41 42 #include <sys/policy.h>
42 43 #include <sys/stat.h>
43 44 #include <sys/mode.h>
44 45 #include <sys/acl.h>
45 46 #include <sys/dmu.h>
46 47 #include <sys/spa.h>
48 +#include <sys/spa_impl.h>
47 49 #include <sys/zfs_fuid.h>
48 50 #include <sys/ddi.h>
49 51 #include <sys/dsl_dataset.h>
52 +#include <sys/special.h>
50 53
51 54 /*
52 55 * These zfs_log_* functions must be called within a dmu tx, in one
53 56 * of 2 contexts depending on zilog->z_replay:
54 57 *
55 58 * Non replay mode
56 59 * ---------------
57 60 * We need to record the transaction so that if it is committed to
58 61 * the Intent Log then it can be replayed. An intent log transaction
59 62 * structure (itx_t) is allocated and all the information necessary to
60 63 * possibly replay the transaction is saved in it. The itx is then assigned
61 64 * a sequence number and inserted in the in-memory list anchored in the zilog.
62 65 *
63 66 * Replay mode
64 67 * -----------
65 68 * We need to mark the intent log record as replayed in the log header.
66 69 * This is done in the same transaction as the replay so that they
67 70 * commit atomically.
68 71 */
69 72
70 73 int
71 74 zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap)
72 75 {
73 76 int isxvattr = (vap->va_mask & AT_XVATTR);
74 77 switch (type) {
75 78 case Z_FILE:
76 79 if (vsecp == NULL && !isxvattr)
77 80 return (TX_CREATE);
78 81 if (vsecp && isxvattr)
79 82 return (TX_CREATE_ACL_ATTR);
80 83 if (vsecp)
81 84 return (TX_CREATE_ACL);
82 85 else
83 86 return (TX_CREATE_ATTR);
84 87 /*NOTREACHED*/
85 88 case Z_DIR:
86 89 if (vsecp == NULL && !isxvattr)
87 90 return (TX_MKDIR);
88 91 if (vsecp && isxvattr)
89 92 return (TX_MKDIR_ACL_ATTR);
90 93 if (vsecp)
91 94 return (TX_MKDIR_ACL);
92 95 else
93 96 return (TX_MKDIR_ATTR);
94 97 case Z_XATTRDIR:
95 98 return (TX_MKXATTR);
96 99 }
97 100 ASSERT(0);
98 101 return (TX_MAX_TYPE);
99 102 }
100 103
101 104 /*
102 105 * build up the log data necessary for logging xvattr_t
103 106 * First lr_attr_t is initialized. following the lr_attr_t
104 107 * is the mapsize and attribute bitmap copied from the xvattr_t.
105 108 * Following the bitmap and bitmapsize two 64 bit words are reserved
106 109 * for the create time which may be set. Following the create time
107 110 * records a single 64 bit integer which has the bits to set on
108 111 * replay for the xvattr.
109 112 */
110 113 static void
111 114 zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap)
112 115 {
113 116 uint32_t *bitmap;
114 117 uint64_t *attrs;
115 118 uint64_t *crtime;
116 119 xoptattr_t *xoap;
117 120 void *scanstamp;
118 121 int i;
119 122
120 123 xoap = xva_getxoptattr(xvap);
121 124 ASSERT(xoap);
122 125
123 126 lrattr->lr_attr_masksize = xvap->xva_mapsize;
124 127 bitmap = &lrattr->lr_attr_bitmap;
125 128 for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) {
126 129 *bitmap = xvap->xva_reqattrmap[i];
127 130 }
128 131
129 132 /* Now pack the attributes up in a single uint64_t */
130 133 attrs = (uint64_t *)bitmap;
131 134 crtime = attrs + 1;
132 135 scanstamp = (caddr_t)(crtime + 2);
133 136 *attrs = 0;
134 137 if (XVA_ISSET_REQ(xvap, XAT_READONLY))
135 138 *attrs |= (xoap->xoa_readonly == 0) ? 0 :
136 139 XAT0_READONLY;
137 140 if (XVA_ISSET_REQ(xvap, XAT_HIDDEN))
138 141 *attrs |= (xoap->xoa_hidden == 0) ? 0 :
139 142 XAT0_HIDDEN;
140 143 if (XVA_ISSET_REQ(xvap, XAT_SYSTEM))
141 144 *attrs |= (xoap->xoa_system == 0) ? 0 :
142 145 XAT0_SYSTEM;
143 146 if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE))
144 147 *attrs |= (xoap->xoa_archive == 0) ? 0 :
145 148 XAT0_ARCHIVE;
146 149 if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE))
147 150 *attrs |= (xoap->xoa_immutable == 0) ? 0 :
148 151 XAT0_IMMUTABLE;
149 152 if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK))
150 153 *attrs |= (xoap->xoa_nounlink == 0) ? 0 :
151 154 XAT0_NOUNLINK;
152 155 if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY))
153 156 *attrs |= (xoap->xoa_appendonly == 0) ? 0 :
154 157 XAT0_APPENDONLY;
155 158 if (XVA_ISSET_REQ(xvap, XAT_OPAQUE))
156 159 *attrs |= (xoap->xoa_opaque == 0) ? 0 :
157 160 XAT0_APPENDONLY;
158 161 if (XVA_ISSET_REQ(xvap, XAT_NODUMP))
159 162 *attrs |= (xoap->xoa_nodump == 0) ? 0 :
160 163 XAT0_NODUMP;
161 164 if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED))
162 165 *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 :
163 166 XAT0_AV_QUARANTINED;
164 167 if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED))
165 168 *attrs |= (xoap->xoa_av_modified == 0) ? 0 :
166 169 XAT0_AV_MODIFIED;
167 170 if (XVA_ISSET_REQ(xvap, XAT_CREATETIME))
168 171 ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime);
169 172 if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP))
170 173 bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ);
171 174 if (XVA_ISSET_REQ(xvap, XAT_REPARSE))
172 175 *attrs |= (xoap->xoa_reparse == 0) ? 0 :
173 176 XAT0_REPARSE;
174 177 if (XVA_ISSET_REQ(xvap, XAT_OFFLINE))
175 178 *attrs |= (xoap->xoa_offline == 0) ? 0 :
176 179 XAT0_OFFLINE;
177 180 if (XVA_ISSET_REQ(xvap, XAT_SPARSE))
178 181 *attrs |= (xoap->xoa_sparse == 0) ? 0 :
179 182 XAT0_SPARSE;
180 183 }
181 184
182 185 static void *
183 186 zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start)
184 187 {
185 188 zfs_fuid_t *zfuid;
186 189 uint64_t *fuidloc = start;
187 190
188 191 /* First copy in the ACE FUIDs */
189 192 for (zfuid = list_head(&fuidp->z_fuids); zfuid;
190 193 zfuid = list_next(&fuidp->z_fuids, zfuid)) {
191 194 *fuidloc++ = zfuid->z_logfuid;
192 195 }
193 196 return (fuidloc);
194 197 }
195 198
196 199
197 200 static void *
198 201 zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start)
199 202 {
200 203 zfs_fuid_domain_t *zdomain;
201 204
202 205 /* now copy in the domain info, if any */
203 206 if (fuidp->z_domain_str_sz != 0) {
204 207 for (zdomain = list_head(&fuidp->z_domains); zdomain;
205 208 zdomain = list_next(&fuidp->z_domains, zdomain)) {
206 209 bcopy((void *)zdomain->z_domain, start,
207 210 strlen(zdomain->z_domain) + 1);
208 211 start = (caddr_t)start +
209 212 strlen(zdomain->z_domain) + 1;
210 213 }
211 214 }
212 215 return (start);
213 216 }
214 217
215 218 /*
216 219 * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and
217 220 * TK_MKXATTR transactions.
218 221 *
219 222 * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID
220 223 * domain information appended prior to the name. In this case the
221 224 * uid/gid in the log record will be a log centric FUID.
222 225 *
223 226 * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that
224 227 * may contain attributes, ACL and optional fuid information.
225 228 *
226 229 * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify
227 230 * and ACL and normal users/groups in the ACEs.
228 231 *
229 232 * There may be an optional xvattr attribute information similar
230 233 * to zfs_log_setattr.
231 234 *
232 235 * Also, after the file name "domain" strings may be appended.
233 236 */
234 237 void
235 238 zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
236 239 znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp,
237 240 zfs_fuid_info_t *fuidp, vattr_t *vap)
238 241 {
239 242 itx_t *itx;
240 243 lr_create_t *lr;
241 244 lr_acl_create_t *lracl;
242 245 size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0;
243 246 size_t xvatsize = 0;
244 247 size_t txsize;
245 248 xvattr_t *xvap = (xvattr_t *)vap;
246 249 void *end;
247 250 size_t lrsize;
248 251 size_t namesize = strlen(name) + 1;
249 252 size_t fuidsz = 0;
250 253
251 254 if (zil_replaying(zilog, tx))
252 255 return;
253 256
254 257 /*
255 258 * If we have FUIDs present then add in space for
256 259 * domains and ACE fuid's if any.
257 260 */
258 261 if (fuidp) {
259 262 fuidsz += fuidp->z_domain_str_sz;
260 263 fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t);
261 264 }
262 265
263 266 if (vap->va_mask & AT_XVATTR)
264 267 xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize);
265 268
266 269 if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR ||
267 270 (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR ||
268 271 (int)txtype == TX_MKXATTR) {
269 272 txsize = sizeof (*lr) + namesize + fuidsz + xvatsize;
270 273 lrsize = sizeof (*lr);
271 274 } else {
272 275 txsize =
273 276 sizeof (lr_acl_create_t) + namesize + fuidsz +
274 277 ZIL_ACE_LENGTH(aclsize) + xvatsize;
275 278 lrsize = sizeof (lr_acl_create_t);
276 279 }
277 280
278 281 itx = zil_itx_create(txtype, txsize);
279 282
280 283 lr = (lr_create_t *)&itx->itx_lr;
281 284 lr->lr_doid = dzp->z_id;
282 285 lr->lr_foid = zp->z_id;
283 286 lr->lr_mode = zp->z_mode;
284 287 if (!IS_EPHEMERAL(zp->z_uid)) {
285 288 lr->lr_uid = (uint64_t)zp->z_uid;
286 289 } else {
287 290 lr->lr_uid = fuidp->z_fuid_owner;
288 291 }
289 292 if (!IS_EPHEMERAL(zp->z_gid)) {
290 293 lr->lr_gid = (uint64_t)zp->z_gid;
291 294 } else {
292 295 lr->lr_gid = fuidp->z_fuid_group;
293 296 }
294 297 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
295 298 sizeof (uint64_t));
296 299 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
297 300 lr->lr_crtime, sizeof (uint64_t) * 2);
298 301
299 302 if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(zp->z_zfsvfs), &lr->lr_rdev,
300 303 sizeof (lr->lr_rdev)) != 0)
301 304 lr->lr_rdev = 0;
302 305
303 306 /*
304 307 * Fill in xvattr info if any
305 308 */
306 309 if (vap->va_mask & AT_XVATTR) {
307 310 zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap);
308 311 end = (caddr_t)lr + lrsize + xvatsize;
309 312 } else {
310 313 end = (caddr_t)lr + lrsize;
311 314 }
312 315
313 316 /* Now fill in any ACL info */
314 317
315 318 if (vsecp) {
316 319 lracl = (lr_acl_create_t *)&itx->itx_lr;
317 320 lracl->lr_aclcnt = vsecp->vsa_aclcnt;
318 321 lracl->lr_acl_bytes = aclsize;
319 322 lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
320 323 lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
321 324 if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS)
322 325 lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
323 326 else
324 327 lracl->lr_acl_flags = 0;
325 328
326 329 bcopy(vsecp->vsa_aclentp, end, aclsize);
327 330 end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize);
328 331 }
329 332
330 333 /* drop in FUID info */
331 334 if (fuidp) {
332 335 end = zfs_log_fuid_ids(fuidp, end);
333 336 end = zfs_log_fuid_domains(fuidp, end);
334 337 }
335 338 /*
336 339 * Now place file name in log record
337 340 */
338 341 bcopy(name, end, namesize);
339 342
340 343 zil_itx_assign(zilog, itx, tx);
341 344 }
342 345
343 346 /*
344 347 * Handles both TX_REMOVE and TX_RMDIR transactions.
345 348 */
346 349 void
347 350 zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
348 351 znode_t *dzp, char *name, uint64_t foid)
349 352 {
350 353 itx_t *itx;
351 354 lr_remove_t *lr;
352 355 size_t namesize = strlen(name) + 1;
353 356
354 357 if (zil_replaying(zilog, tx))
355 358 return;
356 359
357 360 itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
358 361 lr = (lr_remove_t *)&itx->itx_lr;
359 362 lr->lr_doid = dzp->z_id;
360 363 bcopy(name, (char *)(lr + 1), namesize);
361 364
362 365 itx->itx_oid = foid;
363 366
364 367 zil_itx_assign(zilog, itx, tx);
365 368 }
366 369
367 370 /*
368 371 * Handles TX_LINK transactions.
369 372 */
370 373 void
371 374 zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
372 375 znode_t *dzp, znode_t *zp, char *name)
373 376 {
374 377 itx_t *itx;
375 378 lr_link_t *lr;
376 379 size_t namesize = strlen(name) + 1;
377 380
378 381 if (zil_replaying(zilog, tx))
379 382 return;
380 383
381 384 itx = zil_itx_create(txtype, sizeof (*lr) + namesize);
382 385 lr = (lr_link_t *)&itx->itx_lr;
383 386 lr->lr_doid = dzp->z_id;
384 387 lr->lr_link_obj = zp->z_id;
385 388 bcopy(name, (char *)(lr + 1), namesize);
386 389
387 390 zil_itx_assign(zilog, itx, tx);
388 391 }
389 392
390 393 /*
391 394 * Handles TX_SYMLINK transactions.
392 395 */
393 396 void
394 397 zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
395 398 znode_t *dzp, znode_t *zp, char *name, char *link)
396 399 {
397 400 itx_t *itx;
398 401 lr_create_t *lr;
399 402 size_t namesize = strlen(name) + 1;
400 403 size_t linksize = strlen(link) + 1;
401 404
402 405 if (zil_replaying(zilog, tx))
403 406 return;
404 407
405 408 itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize);
406 409 lr = (lr_create_t *)&itx->itx_lr;
407 410 lr->lr_doid = dzp->z_id;
408 411 lr->lr_foid = zp->z_id;
409 412 lr->lr_uid = zp->z_uid;
410 413 lr->lr_gid = zp->z_gid;
411 414 lr->lr_mode = zp->z_mode;
412 415 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zp->z_zfsvfs), &lr->lr_gen,
413 416 sizeof (uint64_t));
414 417 (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
415 418 lr->lr_crtime, sizeof (uint64_t) * 2);
416 419 bcopy(name, (char *)(lr + 1), namesize);
417 420 bcopy(link, (char *)(lr + 1) + namesize, linksize);
418 421
419 422 zil_itx_assign(zilog, itx, tx);
420 423 }
421 424
422 425 /*
423 426 * Handles TX_RENAME transactions.
424 427 */
425 428 void
426 429 zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype,
427 430 znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp)
428 431 {
429 432 itx_t *itx;
430 433 lr_rename_t *lr;
431 434 size_t snamesize = strlen(sname) + 1;
432 435 size_t dnamesize = strlen(dname) + 1;
433 436
434 437 if (zil_replaying(zilog, tx))
435 438 return;
436 439
437 440 itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize);
438 441 lr = (lr_rename_t *)&itx->itx_lr;
439 442 lr->lr_sdoid = sdzp->z_id;
440 443 lr->lr_tdoid = tdzp->z_id;
441 444 bcopy(sname, (char *)(lr + 1), snamesize);
442 445 bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize);
443 446 itx->itx_oid = szp->z_id;
444 447
445 448 zil_itx_assign(zilog, itx, tx);
446 449 }
|
↓ open down ↓ |
387 lines elided |
↑ open up ↑ |
447 450
448 451 /*
449 452 * Handles TX_WRITE transactions.
450 453 */
451 454 ssize_t zfs_immediate_write_sz = 32768;
452 455
453 456 void
454 457 zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype,
455 458 znode_t *zp, offset_t off, ssize_t resid, int ioflag)
456 459 {
457 - uint32_t blocksize = zp->z_blksz;
460 + spa_t *spa = zilog->zl_spa;
461 + spa_meta_placement_t *mp = &spa->spa_meta_policy;
458 462 itx_wr_state_t write_state;
463 + boolean_t slogging, zil_to_special, write_to_special;
464 + size_t immediate_write_sz;
465 + uint32_t blocksize = zp->z_blksz;
459 466 uintptr_t fsync_cnt;
460 467
461 468 if (zil_replaying(zilog, tx) || zp->z_unlinked)
462 469 return;
463 470
464 - if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
471 + /*
472 + * Decide how to handle the write:
473 + * - WR_INDIRECT - synchronously write in zfs format, via dmu_sync()
474 + * - WR_COPIED - write to slog following the tx descriptor as
475 + * immediate data
476 + * - WR_NEED_COPY - copy out in the future (e.g. with next sync)
477 + *
478 + * Special vdevs are as fast as slogs - therefore a conservative
479 + * extension to the existing logic allows for the following
480 + * zpool-configurable options:
481 + *
482 + * (1) SYNC_TO_SPECIAL_DISABLED: do not use special vdev,
483 + * neither for zil, nor for WR_INDIRECT
484 + * (2) SYNC_TO_SPECIAL_STANDARD (default): use special vdev
485 + * exactly like slog
486 + * The remaining two options add the capability to sync data to
487 + * special vdev:
488 + * (3) SYNC_TO_SPECIAL_BALANCED: same as "standard", plus
489 + * load balance writes to the special vdev
490 + * (4) SYNC_TO_SPECIAL_ALWAYS: same as "standard" plus always
491 + * write to the special vdev
492 + *
493 + * Presence of special vdev has no affect if slog is configured:
494 + * the latter indicates that user expects conventional zfs
495 + * sync-write behavior.
496 + */
497 +
498 + immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
499 + ? 0 : zfs_immediate_write_sz;
500 +
501 + /* use special only if all of the following is true */
502 + zil_to_special = !spa_has_slogs(spa) &&
503 + spa_can_special_be_used(spa) &&
504 + mp->spa_sync_to_special != SYNC_TO_SPECIAL_DISABLED;
505 +
506 + /*
507 + * synchronously write data to special in zfs format - the
508 + * WR_INDIRECT case
509 + *
510 + * for the "balanced" option distribute the load based on the
511 + * special-to-normal ratio - the value that is periodically
512 + * recomputed by the load balancer implementing one of
513 + * SPA_SPECIAL_SELECTION_LATENCY etc. strategies
514 + */
515 + write_to_special = !spa_has_slogs(spa) &&
516 + spa_write_data_to_special(spa, zilog->zl_os) &&
517 + (mp->spa_sync_to_special == SYNC_TO_SPECIAL_ALWAYS ||
518 + (mp->spa_sync_to_special == SYNC_TO_SPECIAL_BALANCED &&
519 + spa->spa_avg_stat_rotor % 100 < spa->spa_special_to_normal_ratio));
520 +
521 + slogging = (spa_has_slogs(spa) || zil_to_special) &&
522 + zilog->zl_logbias == ZFS_LOGBIAS_LATENCY;
523 +
524 + if (resid > immediate_write_sz && !slogging && resid <= blocksize)
465 525 write_state = WR_INDIRECT;
526 + else if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT)
527 + write_state = WR_INDIRECT;
466 528 else if (!spa_has_slogs(zilog->zl_spa) &&
467 529 resid >= zfs_immediate_write_sz)
468 530 write_state = WR_INDIRECT;
531 + else if (write_to_special)
532 + write_state = WR_INDIRECT;
469 533 else if (ioflag & (FSYNC | FDSYNC))
470 534 write_state = WR_COPIED;
471 535 else
472 536 write_state = WR_NEED_COPY;
473 537
538 + DTRACE_PROBE3(zfs_lwr, ssize_t, immediate_write_sz,
539 + itx_wr_state_t, write_state, uint_t, zp->z_blksz);
540 +
474 541 if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) {
475 542 (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1));
476 543 }
477 544
478 545 while (resid) {
479 546 itx_t *itx;
480 547 lr_write_t *lr;
481 548 itx_wr_state_t wr_state = write_state;
482 549 ssize_t len = resid;
483 550
484 551 if (wr_state == WR_COPIED && resid > ZIL_MAX_COPIED_DATA)
485 552 wr_state = WR_NEED_COPY;
486 553 else if (wr_state == WR_INDIRECT)
487 554 len = MIN(blocksize - P2PHASE(off, blocksize), resid);
488 555
489 556 itx = zil_itx_create(txtype, sizeof (*lr) +
490 557 (wr_state == WR_COPIED ? len : 0));
491 558 lr = (lr_write_t *)&itx->itx_lr;
492 559 if (wr_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os,
493 560 zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) {
494 561 zil_itx_destroy(itx);
495 562 itx = zil_itx_create(txtype, sizeof (*lr));
496 563 lr = (lr_write_t *)&itx->itx_lr;
497 564 wr_state = WR_NEED_COPY;
498 565 }
499 566
500 567 itx->itx_wr_state = wr_state;
501 568 lr->lr_foid = zp->z_id;
502 569 lr->lr_offset = off;
503 570 lr->lr_length = len;
504 571 lr->lr_blkoff = 0;
505 572 BP_ZERO(&lr->lr_blkptr);
506 573
507 574 itx->itx_private = zp->z_zfsvfs;
508 575
509 576 if (!(ioflag & (FSYNC | FDSYNC)) && (zp->z_sync_cnt == 0) &&
510 577 (fsync_cnt == 0))
511 578 itx->itx_sync = B_FALSE;
512 579
513 580 zil_itx_assign(zilog, itx, tx);
514 581
515 582 off += len;
516 583 resid -= len;
517 584 }
518 585 }
519 586
520 587 /*
521 588 * Handles TX_TRUNCATE transactions.
522 589 */
523 590 void
524 591 zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype,
525 592 znode_t *zp, uint64_t off, uint64_t len)
526 593 {
527 594 itx_t *itx;
528 595 lr_truncate_t *lr;
529 596
530 597 if (zil_replaying(zilog, tx) || zp->z_unlinked)
531 598 return;
532 599
533 600 itx = zil_itx_create(txtype, sizeof (*lr));
534 601 lr = (lr_truncate_t *)&itx->itx_lr;
535 602 lr->lr_foid = zp->z_id;
536 603 lr->lr_offset = off;
537 604 lr->lr_length = len;
538 605
539 606 itx->itx_sync = (zp->z_sync_cnt != 0);
540 607 zil_itx_assign(zilog, itx, tx);
541 608 }
542 609
543 610 /*
544 611 * Handles TX_SETATTR transactions.
545 612 */
546 613 void
547 614 zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype,
548 615 znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp)
549 616 {
550 617 itx_t *itx;
551 618 lr_setattr_t *lr;
552 619 xvattr_t *xvap = (xvattr_t *)vap;
553 620 size_t recsize = sizeof (lr_setattr_t);
554 621 void *start;
555 622
556 623 if (zil_replaying(zilog, tx) || zp->z_unlinked)
557 624 return;
558 625
559 626 /*
560 627 * If XVATTR set, then log record size needs to allow
561 628 * for lr_attr_t + xvattr mask, mapsize and create time
562 629 * plus actual attribute values
563 630 */
564 631 if (vap->va_mask & AT_XVATTR)
565 632 recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize);
566 633
567 634 if (fuidp)
568 635 recsize += fuidp->z_domain_str_sz;
569 636
570 637 itx = zil_itx_create(txtype, recsize);
571 638 lr = (lr_setattr_t *)&itx->itx_lr;
572 639 lr->lr_foid = zp->z_id;
573 640 lr->lr_mask = (uint64_t)mask_applied;
574 641 lr->lr_mode = (uint64_t)vap->va_mode;
575 642 if ((mask_applied & AT_UID) && IS_EPHEMERAL(vap->va_uid))
576 643 lr->lr_uid = fuidp->z_fuid_owner;
577 644 else
578 645 lr->lr_uid = (uint64_t)vap->va_uid;
579 646
580 647 if ((mask_applied & AT_GID) && IS_EPHEMERAL(vap->va_gid))
581 648 lr->lr_gid = fuidp->z_fuid_group;
582 649 else
583 650 lr->lr_gid = (uint64_t)vap->va_gid;
584 651
585 652 lr->lr_size = (uint64_t)vap->va_size;
586 653 ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime);
587 654 ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime);
588 655 start = (lr_setattr_t *)(lr + 1);
589 656 if (vap->va_mask & AT_XVATTR) {
590 657 zfs_log_xvattr((lr_attr_t *)start, xvap);
591 658 start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize);
592 659 }
593 660
594 661 /*
595 662 * Now stick on domain information if any on end
596 663 */
597 664
598 665 if (fuidp)
599 666 (void) zfs_log_fuid_domains(fuidp, start);
600 667
601 668 itx->itx_sync = (zp->z_sync_cnt != 0);
602 669 zil_itx_assign(zilog, itx, tx);
603 670 }
604 671
605 672 /*
606 673 * Handles TX_ACL transactions.
607 674 */
608 675 void
609 676 zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp,
610 677 vsecattr_t *vsecp, zfs_fuid_info_t *fuidp)
611 678 {
612 679 itx_t *itx;
613 680 lr_acl_v0_t *lrv0;
614 681 lr_acl_t *lr;
615 682 int txtype;
616 683 int lrsize;
617 684 size_t txsize;
618 685 size_t aclbytes = vsecp->vsa_aclentsz;
619 686
620 687 if (zil_replaying(zilog, tx) || zp->z_unlinked)
621 688 return;
622 689
623 690 txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ?
624 691 TX_ACL_V0 : TX_ACL;
625 692
626 693 if (txtype == TX_ACL)
627 694 lrsize = sizeof (*lr);
628 695 else
629 696 lrsize = sizeof (*lrv0);
630 697
631 698 txsize = lrsize +
632 699 ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) +
633 700 (fuidp ? fuidp->z_domain_str_sz : 0) +
634 701 sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0);
635 702
636 703 itx = zil_itx_create(txtype, txsize);
637 704
638 705 lr = (lr_acl_t *)&itx->itx_lr;
639 706 lr->lr_foid = zp->z_id;
640 707 if (txtype == TX_ACL) {
641 708 lr->lr_acl_bytes = aclbytes;
642 709 lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0;
643 710 lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0;
644 711 if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS)
645 712 lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags;
646 713 else
647 714 lr->lr_acl_flags = 0;
648 715 }
649 716 lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt;
650 717
651 718 if (txtype == TX_ACL_V0) {
652 719 lrv0 = (lr_acl_v0_t *)lr;
653 720 bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes);
654 721 } else {
655 722 void *start = (ace_t *)(lr + 1);
656 723
657 724 bcopy(vsecp->vsa_aclentp, start, aclbytes);
658 725
659 726 start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes);
660 727
661 728 if (fuidp) {
662 729 start = zfs_log_fuid_ids(fuidp, start);
663 730 (void) zfs_log_fuid_domains(fuidp, start);
664 731 }
665 732 }
666 733
667 734 itx->itx_sync = (zp->z_sync_cnt != 0);
668 735 zil_itx_assign(zilog, itx, tx);
669 736 }
|
↓ open down ↓ |
186 lines elided |
↑ open up ↑ |
XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX