NEX-20218 Backport Illumos #9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
MFV illumos-gate@fa41d87de9ec9000964c605eb01d6dc19e4a1abe
9464 txg_kick() fails to see that we are quiescing, forcing transactions to their next stages without leaving them accumulate changes
Reviewed by: Matt Ahrens <matt@delphix.com>
Reviewed by: Brad Lewis <brad.lewis@delphix.com>
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Approved by: Dan McDonald <danmcd@joyent.com>
NEX-6859 TX-commit callback that is registered in sync-ctx causes system panic
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
NEX-5795 Rename 'wrc' as 'wbc' in the source and in the tech docs
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
Reviewed by: Sanjay Nadkarni <sanjay.nadkarni@nexenta.com>
Reviewed by: Alek Pinchuk <alek.pinchuk@nexenta.com>
NEX-4582 update wrc test cases to allow using write back cache per tree of datasets
Reviewed by: Steve Peng <steve.peng@nexenta.com>
Reviewed by: Alex Aizman <alex.aizman@nexenta.com>
5960 zfs recv should prefetch indirect blocks
5925 zfs receive -o origin=
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
5911 ZFS "hangs" while deleting file
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
NEX-1823 Slow performance doing of a large dataset
5911 ZFS "hangs" while deleting file
Reviewed by: Saso Kiselkov <saso.kiselkov@nexenta.com>
Reviewed by: Bayard Bell <bayard.bell@nexenta.com>
Moved closed ZFS files to open repo, changed Makefiles accordingly
Removed unneeded weak symbols
re #12585 rb4049 ZFS++ work port - refactoring to improve separation of open/closed code, bug fixes, performance improvements - open code
Bug 11205: add missing libzfs_closed_stubs.c to fix opensource-only build.
ZFS plus work: special vdevs, cos, cos/vdev properties
--- old/usr/src/uts/common/fs/zfs/dmu_tx.c
+++ new/usr/src/uts/common/fs/zfs/dmu_tx.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 - * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
23 + * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
24 24 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
25 25 * Copyright (c) 2014 Integros [integros.com]
26 26 */
27 27
28 28 #include <sys/dmu.h>
29 29 #include <sys/dmu_impl.h>
30 30 #include <sys/dbuf.h>
31 31 #include <sys/dmu_tx.h>
32 32 #include <sys/dmu_objset.h>
33 33 #include <sys/dsl_dataset.h>
34 34 #include <sys/dsl_dir.h>
35 35 #include <sys/dsl_pool.h>
36 36 #include <sys/zap_impl.h>
37 37 #include <sys/spa.h>
38 38 #include <sys/sa.h>
39 39 #include <sys/sa_impl.h>
40 40 #include <sys/zfs_context.h>
41 41 #include <sys/varargs.h>
42 42
43 43 typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
44 44 uint64_t arg1, uint64_t arg2);
45 45
46 46
47 47 dmu_tx_t *
48 48 dmu_tx_create_dd(dsl_dir_t *dd)
49 49 {
50 50 dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
51 51 tx->tx_dir = dd;
52 52 if (dd != NULL)
53 53 tx->tx_pool = dd->dd_pool;
54 54 list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
55 55 offsetof(dmu_tx_hold_t, txh_node));
56 56 list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
57 57 offsetof(dmu_tx_callback_t, dcb_node));
58 58 tx->tx_start = gethrtime();
59 59 return (tx);
60 60 }
61 61
62 62 dmu_tx_t *
63 63 dmu_tx_create(objset_t *os)
64 64 {
65 65 dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
66 66 tx->tx_objset = os;
67 67 return (tx);
68 68 }
69 69
70 70 dmu_tx_t *
71 71 dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
72 72 {
73 73 dmu_tx_t *tx = dmu_tx_create_dd(NULL);
74 74
75 75 txg_verify(dp->dp_spa, txg);
76 76 tx->tx_pool = dp;
77 77 tx->tx_txg = txg;
78 78 tx->tx_anyobj = TRUE;
79 79
80 80 return (tx);
81 81 }
82 82
83 83 int
84 84 dmu_tx_is_syncing(dmu_tx_t *tx)
85 85 {
86 86 return (tx->tx_anyobj);
87 87 }
88 88
89 89 int
90 90 dmu_tx_private_ok(dmu_tx_t *tx)
91 91 {
92 92 return (tx->tx_anyobj);
93 93 }
94 94
95 95 static dmu_tx_hold_t *
96 96 dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
97 97 uint64_t arg1, uint64_t arg2)
98 98 {
99 99 dmu_tx_hold_t *txh;
100 100
101 101 if (dn != NULL) {
102 102 (void) refcount_add(&dn->dn_holds, tx);
103 103 if (tx->tx_txg != 0) {
104 104 mutex_enter(&dn->dn_mtx);
105 105 /*
106 106 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
107 107 * problem, but there's no way for it to happen (for
108 108 * now, at least).
109 109 */
110 110 ASSERT(dn->dn_assigned_txg == 0);
111 111 dn->dn_assigned_txg = tx->tx_txg;
112 112 (void) refcount_add(&dn->dn_tx_holds, tx);
113 113 mutex_exit(&dn->dn_mtx);
114 114 }
115 115 }
116 116
117 117 txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP);
118 118 txh->txh_tx = tx;
119 119 txh->txh_dnode = dn;
120 120 refcount_create(&txh->txh_space_towrite);
121 121 refcount_create(&txh->txh_memory_tohold);
122 122 txh->txh_type = type;
123 123 txh->txh_arg1 = arg1;
124 124 txh->txh_arg2 = arg2;
125 125 list_insert_tail(&tx->tx_holds, txh);
126 126
127 127 return (txh);
128 128 }
129 129
130 130 static dmu_tx_hold_t *
131 131 dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object,
132 132 enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2)
133 133 {
134 134 dnode_t *dn = NULL;
135 135 dmu_tx_hold_t *txh;
136 136 int err;
137 137
138 138 if (object != DMU_NEW_OBJECT) {
139 139 err = dnode_hold(os, object, FTAG, &dn);
140 140 if (err != 0) {
141 141 tx->tx_err = err;
142 142 return (NULL);
143 143 }
144 144 }
145 145 txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2);
146 146 if (dn != NULL)
147 147 dnode_rele(dn, FTAG);
148 148 return (txh);
149 149 }
150 150
151 151 void
152 152 dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn)
153 153 {
154 154 /*
155 155 * If we're syncing, they can manipulate any object anyhow, and
156 156 * the hold on the dnode_t can cause problems.
157 157 */
158 158 if (!dmu_tx_is_syncing(tx))
159 159 (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0);
160 160 }
161 161
162 162 /*
163 163 * This function reads specified data from disk. The specified data will
164 164 * be needed to perform the transaction -- i.e, it will be read after
165 165 * we do dmu_tx_assign(). There are two reasons that we read the data now
166 166 * (before dmu_tx_assign()):
167 167 *
168 168 * 1. Reading it now has potentially better performance. The transaction
169 169 * has not yet been assigned, so the TXG is not held open, and also the
170 170 * caller typically has less locks held when calling dmu_tx_hold_*() than
171 171 * after the transaction has been assigned. This reduces the lock (and txg)
172 172 * hold times, thus reducing lock contention.
173 173 *
174 174 * 2. It is easier for callers (primarily the ZPL) to handle i/o errors
175 175 * that are detected before they start making changes to the DMU state
176 176 * (i.e. now). Once the transaction has been assigned, and some DMU
177 177 * state has been changed, it can be difficult to recover from an i/o
178 178 * error (e.g. to undo the changes already made in memory at the DMU
179 179 * layer). Typically code to do so does not exist in the caller -- it
180 180 * assumes that the data has already been cached and thus i/o errors are
181 181 * not possible.
182 182 *
183 183 * It has been observed that the i/o initiated here can be a performance
184 184 * problem, and it appears to be optional, because we don't look at the
185 185 * data which is read. However, removing this read would only serve to
186 186 * move the work elsewhere (after the dmu_tx_assign()), where it may
187 187 * have a greater impact on performance (in addition to the impact on
188 188 * fault tolerance noted above).
189 189 */
190 190 static int
191 191 dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid)
192 192 {
193 193 int err;
194 194 dmu_buf_impl_t *db;
195 195
196 196 rw_enter(&dn->dn_struct_rwlock, RW_READER);
197 197 db = dbuf_hold_level(dn, level, blkid, FTAG);
198 198 rw_exit(&dn->dn_struct_rwlock);
199 199 if (db == NULL)
200 200 return (SET_ERROR(EIO));
201 201 err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH);
202 202 dbuf_rele(db, FTAG);
203 203 return (err);
204 204 }
205 205
206 206 /* ARGSUSED */
207 207 static void
208 208 dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
209 209 {
210 210 dnode_t *dn = txh->txh_dnode;
211 211 int err = 0;
212 212
213 213 if (len == 0)
214 214 return;
215 215
216 216 (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG);
217 217
218 218 if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS)
219 219 err = SET_ERROR(EFBIG);
220 220
221 221 if (dn == NULL)
222 222 return;
223 223
224 224 /*
225 225 * For i/o error checking, read the blocks that will be needed
226 226 * to perform the write: the first and last level-0 blocks (if
227 227 * they are not aligned, i.e. if they are partial-block writes),
228 228 * and all the level-1 blocks.
229 229 */
230 230 if (dn->dn_maxblkid == 0) {
231 231 if (off < dn->dn_datablksz &&
232 232 (off > 0 || len < dn->dn_datablksz)) {
233 233 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
234 234 if (err != 0) {
235 235 txh->txh_tx->tx_err = err;
236 236 }
237 237 }
238 238 } else {
239 239 zio_t *zio = zio_root(dn->dn_objset->os_spa,
240 240 NULL, NULL, ZIO_FLAG_CANFAIL);
241 241
242 242 /* first level-0 block */
243 243 uint64_t start = off >> dn->dn_datablkshift;
244 244 if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) {
245 245 err = dmu_tx_check_ioerr(zio, dn, 0, start);
246 246 if (err != 0) {
247 247 txh->txh_tx->tx_err = err;
248 248 }
249 249 }
250 250
251 251 /* last level-0 block */
252 252 uint64_t end = (off + len - 1) >> dn->dn_datablkshift;
253 253 if (end != start && end <= dn->dn_maxblkid &&
254 254 P2PHASE(off + len, dn->dn_datablksz)) {
255 255 err = dmu_tx_check_ioerr(zio, dn, 0, end);
256 256 if (err != 0) {
257 257 txh->txh_tx->tx_err = err;
258 258 }
259 259 }
260 260
261 261 /* level-1 blocks */
262 262 if (dn->dn_nlevels > 1) {
263 263 int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
264 264 for (uint64_t i = (start >> shft) + 1;
265 265 i < end >> shft; i++) {
266 266 err = dmu_tx_check_ioerr(zio, dn, 1, i);
267 267 if (err != 0) {
268 268 txh->txh_tx->tx_err = err;
269 269 }
270 270 }
271 271 }
272 272
273 273 err = zio_wait(zio);
274 274 if (err != 0) {
275 275 txh->txh_tx->tx_err = err;
276 276 }
277 277 }
278 278 }
279 279
280 280 static void
281 281 dmu_tx_count_dnode(dmu_tx_hold_t *txh)
282 282 {
283 283 (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG);
284 284 }
285 285
286 286 void
287 287 dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len)
288 288 {
289 289 dmu_tx_hold_t *txh;
290 290
291 291 ASSERT0(tx->tx_txg);
292 292 ASSERT3U(len, <=, DMU_MAX_ACCESS);
293 293 ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
294 294
295 295 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
296 296 object, THT_WRITE, off, len);
297 297 if (txh != NULL) {
298 298 dmu_tx_count_write(txh, off, len);
299 299 dmu_tx_count_dnode(txh);
300 300 }
301 301 }
302 302
303 303 void
304 -dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object)
305 -{
306 - dmu_tx_hold_t *txh;
307 -
308 - ASSERT(tx->tx_txg == 0);
309 - txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
310 - object, THT_WRITE, 0, 0);
311 - if (txh == NULL)
312 - return;
313 -
314 - dnode_t *dn = txh->txh_dnode;
315 - (void) refcount_add_many(&txh->txh_space_towrite,
316 - 1ULL << dn->dn_indblkshift, FTAG);
317 - dmu_tx_count_dnode(txh);
318 -}
319 -
320 -void
321 304 dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len)
322 305 {
323 306 dmu_tx_hold_t *txh;
324 307
325 308 ASSERT0(tx->tx_txg);
326 309 ASSERT3U(len, <=, DMU_MAX_ACCESS);
327 310 ASSERT(len == 0 || UINT64_MAX - off >= len - 1);
328 311
329 312 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len);
330 313 if (txh != NULL) {
331 314 dmu_tx_count_write(txh, off, len);
332 315 dmu_tx_count_dnode(txh);
333 316 }
334 317 }
335 318
336 319 /*
337 320 * This function marks the transaction as being a "net free". The end
338 321 * result is that refquotas will be disabled for this transaction, and
339 322 * this transaction will be able to use half of the pool space overhead
340 323 * (see dsl_pool_adjustedsize()). Therefore this function should only
341 324 * be called for transactions that we expect will not cause a net increase
342 325 * in the amount of space used (but it's OK if that is occasionally not true).
343 326 */
344 327 void
345 328 dmu_tx_mark_netfree(dmu_tx_t *tx)
346 329 {
347 330 tx->tx_netfree = B_TRUE;
348 331 }
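
For reference, a minimal sketch of how a free-only caller (e.g. a truncate or remove path) might use this; the object and range values are placeholders and this code is not part of the patch:

	dmu_tx_t *tx = dmu_tx_create(os);
	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
	dmu_tx_mark_netfree(tx);	/* we expect a net decrease in space used */
	if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... free the range under this tx ... */
	dmu_tx_commit(tx);
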
349 332
350 333 static void
351 334 dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
352 335 {
353 336 dmu_tx_t *tx;
354 337 dnode_t *dn;
355 338 int err;
356 339
357 340 tx = txh->txh_tx;
358 341 ASSERT(tx->tx_txg == 0);
359 342
360 343 dn = txh->txh_dnode;
361 344 dmu_tx_count_dnode(txh);
362 345
363 346 if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
364 347 return;
365 348 if (len == DMU_OBJECT_END)
366 349 len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
367 350
368 351 /*
369 352 * For i/o error checking, we read the first and last level-0
370 353 * blocks if they are not aligned, and all the level-1 blocks.
371 354 *
372 355 * Note: dbuf_free_range() assumes that we have not instantiated
373 356 * any level-0 dbufs that will be completely freed. Therefore we must
374 357 * exercise care to not read or count the first and last blocks
375 358 * if they are blocksize-aligned.
376 359 */
377 360 if (dn->dn_datablkshift == 0) {
378 361 if (off != 0 || len < dn->dn_datablksz)
379 362 dmu_tx_count_write(txh, 0, dn->dn_datablksz);
380 363 } else {
381 364 /* first block will be modified if it is not aligned */
382 365 if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
383 366 dmu_tx_count_write(txh, off, 1);
384 367 /* last block will be modified if it is not aligned */
385 368 if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
386 369 dmu_tx_count_write(txh, off + len, 1);
387 370 }
388 371
389 372 /*
390 373 * Check level-1 blocks.
391 374 */
392 375 if (dn->dn_nlevels > 1) {
393 376 int shift = dn->dn_datablkshift + dn->dn_indblkshift -
394 377 SPA_BLKPTRSHIFT;
395 378 uint64_t start = off >> shift;
396 379 uint64_t end = (off + len) >> shift;
397 380
398 381 ASSERT(dn->dn_indblkshift != 0);
399 382
400 383 /*
401 384 * dnode_reallocate() can result in an object with indirect
402 385 * blocks having an odd data block size. In this case,
403 386 * just check the single block.
404 387 */
405 388 if (dn->dn_datablkshift == 0)
406 389 start = end = 0;
407 390
408 391 zio_t *zio = zio_root(tx->tx_pool->dp_spa,
409 392 NULL, NULL, ZIO_FLAG_CANFAIL);
410 393 for (uint64_t i = start; i <= end; i++) {
411 394 uint64_t ibyte = i << shift;
412 395 err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
413 396 i = ibyte >> shift;
414 397 if (err == ESRCH || i > end)
415 398 break;
416 399 if (err != 0) {
417 400 tx->tx_err = err;
418 401 (void) zio_wait(zio);
419 402 return;
420 403 }
421 404
422 405 (void) refcount_add_many(&txh->txh_memory_tohold,
423 406 1 << dn->dn_indblkshift, FTAG);
424 407
425 408 err = dmu_tx_check_ioerr(zio, dn, 1, i);
426 409 if (err != 0) {
427 410 tx->tx_err = err;
428 411 (void) zio_wait(zio);
429 412 return;
430 413 }
431 414 }
432 415 err = zio_wait(zio);
433 416 if (err != 0) {
434 417 tx->tx_err = err;
435 418 return;
436 419 }
437 420 }
438 421 }
439 422
440 423 void
441 424 dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
442 425 {
443 426 dmu_tx_hold_t *txh;
444 427
445 428 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
446 429 object, THT_FREE, off, len);
447 430 if (txh != NULL)
448 431 (void) dmu_tx_hold_free_impl(txh, off, len);
449 432 }
450 433
451 434 void
452 435 dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len)
453 436 {
454 437 dmu_tx_hold_t *txh;
455 438
456 439 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
457 440 if (txh != NULL)
458 441 (void) dmu_tx_hold_free_impl(txh, off, len);
459 442 }
460 443
461 444 static void
462 445 dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
463 446 {
464 447 dmu_tx_t *tx = txh->txh_tx;
465 448 dnode_t *dn;
466 449 int err;
467 450
468 451 ASSERT(tx->tx_txg == 0);
469 452
470 453 dn = txh->txh_dnode;
471 454
472 455 dmu_tx_count_dnode(txh);
473 456
474 457 /*
 475 458  * Modifying an almost-full microzap is around the worst case (128KB)
476 459 *
477 460 * If it is a fat zap, the worst case would be 7*16KB=112KB:
478 461 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
479 462 * - 4 new blocks written if adding:
480 463 * - 2 blocks for possibly split leaves,
481 464 * - 2 grown ptrtbl blocks
482 465 */
483 466 (void) refcount_add_many(&txh->txh_space_towrite,
484 467 MZAP_MAX_BLKSZ, FTAG);
485 468
486 469 if (dn == NULL)
487 470 return;
488 471
489 472 ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP);
490 473
491 474 if (dn->dn_maxblkid == 0 || name == NULL) {
492 475 /*
493 476 * This is a microzap (only one block), or we don't know
494 477 * the name. Check the first block for i/o errors.
495 478 */
496 479 err = dmu_tx_check_ioerr(NULL, dn, 0, 0);
497 480 if (err != 0) {
498 481 tx->tx_err = err;
499 482 }
500 483 } else {
501 484 /*
502 485 * Access the name so that we'll check for i/o errors to
503 486 * the leaf blocks, etc. We ignore ENOENT, as this name
504 487 * may not yet exist.
505 488 */
506 489 err = zap_lookup_by_dnode(dn, name, 8, 0, NULL);
507 490 if (err == EIO || err == ECKSUM || err == ENXIO) {
508 491 tx->tx_err = err;
509 492 }
510 493 }
511 494 }
512 495
513 496 void
514 497 dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name)
515 498 {
516 499 dmu_tx_hold_t *txh;
517 500
518 501 ASSERT0(tx->tx_txg);
519 502
520 503 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
521 504 object, THT_ZAP, add, (uintptr_t)name);
522 505 if (txh != NULL)
523 506 dmu_tx_hold_zap_impl(txh, name);
524 507 }
525 508
526 509 void
527 510 dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name)
528 511 {
529 512 dmu_tx_hold_t *txh;
530 513
531 514 ASSERT0(tx->tx_txg);
532 515 ASSERT(dn != NULL);
533 516
534 517 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name);
535 518 if (txh != NULL)
536 519 dmu_tx_hold_zap_impl(txh, name);
537 520 }
538 521
539 522 void
540 523 dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object)
541 524 {
542 525 dmu_tx_hold_t *txh;
543 526
544 527 ASSERT(tx->tx_txg == 0);
545 528
546 529 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
547 530 object, THT_BONUS, 0, 0);
548 531 if (txh)
549 532 dmu_tx_count_dnode(txh);
550 533 }
551 534
552 535 void
553 536 dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn)
554 537 {
555 538 dmu_tx_hold_t *txh;
556 539
557 540 ASSERT0(tx->tx_txg);
558 541
559 542 txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0);
560 543 if (txh)
561 544 dmu_tx_count_dnode(txh);
562 545 }
563 546
564 547 void
565 548 dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space)
566 549 {
567 550 dmu_tx_hold_t *txh;
568 551 ASSERT(tx->tx_txg == 0);
569 552
570 553 txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
571 554 DMU_NEW_OBJECT, THT_SPACE, space, 0);
572 555
573 556 (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG);
574 557 }
575 558
576 559 #ifdef ZFS_DEBUG
577 560 void
578 561 dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db)
579 562 {
580 563 boolean_t match_object = B_FALSE;
581 564 boolean_t match_offset = B_FALSE;
582 565
583 566 DB_DNODE_ENTER(db);
584 567 dnode_t *dn = DB_DNODE(db);
585 568 ASSERT(tx->tx_txg != 0);
586 569 ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset);
587 570 ASSERT3U(dn->dn_object, ==, db->db.db_object);
588 571
589 572 if (tx->tx_anyobj) {
590 573 DB_DNODE_EXIT(db);
591 574 return;
592 575 }
593 576
594 577 /* XXX No checking on the meta dnode for now */
595 578 if (db->db.db_object == DMU_META_DNODE_OBJECT) {
596 579 DB_DNODE_EXIT(db);
597 580 return;
598 581 }
599 582
600 583 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
601 584 txh = list_next(&tx->tx_holds, txh)) {
602 585 ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg);
603 586 if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT)
604 587 match_object = TRUE;
605 588 if (txh->txh_dnode == NULL || txh->txh_dnode == dn) {
606 589 int datablkshift = dn->dn_datablkshift ?
607 590 dn->dn_datablkshift : SPA_MAXBLOCKSHIFT;
608 591 int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
609 592 int shift = datablkshift + epbs * db->db_level;
610 593 uint64_t beginblk = shift >= 64 ? 0 :
611 594 (txh->txh_arg1 >> shift);
612 595 uint64_t endblk = shift >= 64 ? 0 :
613 596 ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift);
614 597 uint64_t blkid = db->db_blkid;
615 598
616 599 /* XXX txh_arg2 better not be zero... */
617 600
618 601 dprintf("found txh type %x beginblk=%llx endblk=%llx\n",
619 602 txh->txh_type, beginblk, endblk);
620 603
621 604 switch (txh->txh_type) {
622 605 case THT_WRITE:
623 606 if (blkid >= beginblk && blkid <= endblk)
624 607 match_offset = TRUE;
625 608 /*
626 609 * We will let this hold work for the bonus
627 610 * or spill buffer so that we don't need to
628 611 * hold it when creating a new object.
629 612 */
630 613 if (blkid == DMU_BONUS_BLKID ||
631 614 blkid == DMU_SPILL_BLKID)
632 615 match_offset = TRUE;
633 616 /*
634 617 * They might have to increase nlevels,
635 618 * thus dirtying the new TLIBs. Or the
 636 619  * thus dirtying the new TLIBs. Or they
637 620 * thus dirying the new lvl=0 blk=0.
 638 621  * thus dirtying the new lvl=0 blk=0.
639 622 if (blkid == 0)
640 623 match_offset = TRUE;
641 624 break;
642 625 case THT_FREE:
643 626 /*
644 627 * We will dirty all the level 1 blocks in
645 628 * the free range and perhaps the first and
646 629 * last level 0 block.
647 630 */
648 631 if (blkid >= beginblk && (blkid <= endblk ||
649 632 txh->txh_arg2 == DMU_OBJECT_END))
650 633 match_offset = TRUE;
651 634 break;
652 635 case THT_SPILL:
653 636 if (blkid == DMU_SPILL_BLKID)
654 637 match_offset = TRUE;
655 638 break;
656 639 case THT_BONUS:
657 640 if (blkid == DMU_BONUS_BLKID)
658 641 match_offset = TRUE;
659 642 break;
660 643 case THT_ZAP:
661 644 match_offset = TRUE;
662 645 break;
663 646 case THT_NEWOBJECT:
664 647 match_object = TRUE;
665 648 break;
666 649 default:
667 650 ASSERT(!"bad txh_type");
668 651 }
669 652 }
670 653 if (match_object && match_offset) {
671 654 DB_DNODE_EXIT(db);
672 655 return;
673 656 }
674 657 }
675 658 DB_DNODE_EXIT(db);
676 659 panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
677 660 (u_longlong_t)db->db.db_object, db->db_level,
678 661 (u_longlong_t)db->db_blkid);
679 662 }
680 663 #endif
681 664
682 665 /*
683 666 * If we can't do 10 iops, something is wrong. Let us go ahead
684 667 * and hit zfs_dirty_data_max.
685 668 */
686 669 hrtime_t zfs_delay_max_ns = MSEC2NSEC(100);
687 670 int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
688 671
689 672 /*
690 673 * We delay transactions when we've determined that the backend storage
691 674 * isn't able to accommodate the rate of incoming writes.
692 675 *
693 676 * If there is already a transaction waiting, we delay relative to when
694 677 * that transaction finishes waiting. This way the calculated min_time
695 678 * is independent of the number of threads concurrently executing
696 679 * transactions.
697 680 *
698 681 * If we are the only waiter, wait relative to when the transaction
699 682 * started, rather than the current time. This credits the transaction for
700 683 * "time already served", e.g. reading indirect blocks.
701 684 *
702 685 * The minimum time for a transaction to take is calculated as:
703 686 * min_time = scale * (dirty - min) / (max - dirty)
704 687 * min_time is then capped at zfs_delay_max_ns.
705 688 *
706 689 * The delay has two degrees of freedom that can be adjusted via tunables.
707 690 * The percentage of dirty data at which we start to delay is defined by
708 691 * zfs_delay_min_dirty_percent. This should typically be at or above
709 692 * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
710 693 * delay after writing at full speed has failed to keep up with the incoming
711 694 * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
712 695 * speaking, this variable determines the amount of delay at the midpoint of
713 696 * the curve.
714 697 *
715 698 * delay
716 699 * 10ms +-------------------------------------------------------------*+
717 700 * | *|
718 701 * 9ms + *+
719 702 * | *|
720 703 * 8ms + *+
721 704 * | * |
722 705 * 7ms + * +
723 706 * | * |
724 707 * 6ms + * +
725 708 * | * |
726 709 * 5ms + * +
727 710 * | * |
728 711 * 4ms + * +
729 712 * | * |
730 713 * 3ms + * +
731 714 * | * |
732 715 * 2ms + (midpoint) * +
733 716 * | | ** |
734 717 * 1ms + v *** +
735 718 * | zfs_delay_scale ----------> ******** |
736 719 * 0 +-------------------------------------*********----------------+
737 720 * 0% <- zfs_dirty_data_max -> 100%
738 721 *
739 722 * Note that since the delay is added to the outstanding time remaining on the
740 723 * most recent transaction, the delay is effectively the inverse of IOPS.
741 724 * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve
742 725 * was chosen such that small changes in the amount of accumulated dirty data
743 726 * in the first 3/4 of the curve yield relatively small differences in the
744 727 * amount of delay.
745 728 *
746 729 * The effects can be easier to understand when the amount of delay is
747 730 * represented on a log scale:
748 731 *
749 732 * delay
750 733 * 100ms +-------------------------------------------------------------++
751 734 * + +
752 735 * | |
753 736 * + *+
754 737 * 10ms + *+
755 738 * + ** +
756 739 * | (midpoint) ** |
757 740 * + | ** +
758 741 * 1ms + v **** +
759 742 * + zfs_delay_scale ----------> ***** +
760 743 * | **** |
761 744 * + **** +
762 745 * 100us + ** +
763 746 * + * +
764 747 * | * |
765 748 * + * +
766 749 * 10us + * +
767 750 * + +
768 751 * | |
769 752 * + +
770 753 * +--------------------------------------------------------------+
771 754 * 0% <- zfs_dirty_data_max -> 100%
772 755 *
773 756 * Note here that only as the amount of dirty data approaches its limit does
774 757 * the delay start to increase rapidly. The goal of a properly tuned system
775 758 * should be to keep the amount of dirty data out of that range by first
776 759 * ensuring that the appropriate limits are set for the I/O scheduler to reach
777 760 * optimal throughput on the backend storage, and then by changing the value
778 761 * of zfs_delay_scale to increase the steepness of the curve.
779 762 */
780 763 static void
781 764 dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty)
782 765 {
783 766 dsl_pool_t *dp = tx->tx_pool;
784 767 uint64_t delay_min_bytes =
785 768 zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
786 769 hrtime_t wakeup, min_tx_time, now;
787 770
788 771 if (dirty <= delay_min_bytes)
789 772 return;
790 773
791 774 /*
792 775 * The caller has already waited until we are under the max.
793 776 * We make them pass us the amount of dirty data so we don't
794 777 * have to handle the case of it being >= the max, which could
795 778 * cause a divide-by-zero if it's == the max.
796 779 */
797 780 ASSERT3U(dirty, <, zfs_dirty_data_max);
798 781
799 782 now = gethrtime();
800 783 min_tx_time = zfs_delay_scale *
801 784 (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty);
802 785 if (now > tx->tx_start + min_tx_time)
803 786 return;
804 787
805 788 min_tx_time = MIN(min_tx_time, zfs_delay_max_ns);
806 789
807 790 DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty,
808 791 uint64_t, min_tx_time);
809 792
810 793 mutex_enter(&dp->dp_lock);
811 794 wakeup = MAX(tx->tx_start + min_tx_time,
812 795 dp->dp_last_wakeup + min_tx_time);
813 796 dp->dp_last_wakeup = wakeup;
814 797 mutex_exit(&dp->dp_lock);
815 798
816 799 #ifdef _KERNEL
817 800 mutex_enter(&curthread->t_delay_lock);
818 801 while (cv_timedwait_hires(&curthread->t_delay_cv,
819 802 &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns,
820 803 CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0)
821 804 continue;
822 805 mutex_exit(&curthread->t_delay_lock);
823 806 #else
824 807 hrtime_t delta = wakeup - gethrtime();
825 808 struct timespec ts;
826 809 ts.tv_sec = delta / NANOSEC;
827 810 ts.tv_nsec = delta % NANOSEC;
828 811 (void) nanosleep(&ts, NULL);
829 812 #endif
830 813 }
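
As a worked example of the min_time formula above, assuming illustrative tunable values (not values set by this patch) of zfs_dirty_data_max = 4 GiB, zfs_delay_min_dirty_percent = 60 (so delay_min_bytes = 2.4 GiB) and zfs_delay_scale = 500000, a transaction arriving with 3.6 GiB dirty would be held back by at least:

	min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) /
	    (zfs_dirty_data_max - dirty)
	            = 500000 * (3.6 GiB - 2.4 GiB) / (4 GiB - 3.6 GiB)
	            = 1500000 ns  (1.5 ms)

capped at zfs_delay_max_ns (100 ms above), and applied relative to tx_start or the previous waiter's wakeup as described in the comment.
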
831 814
832 815 /*
833 816 * This routine attempts to assign the transaction to a transaction group.
834 817 * To do so, we must determine if there is sufficient free space on disk.
835 818 *
836 819 * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree()
837 820 * on it), then it is assumed that there is sufficient free space,
838 821 * unless there's insufficient slop space in the pool (see the comment
839 822 * above spa_slop_shift in spa_misc.c).
840 823 *
841 824 * If it is not a "netfree" transaction, then if the data already on disk
842 825 * is over the allowed usage (e.g. quota), this will fail with EDQUOT or
843 826 * ENOSPC. Otherwise, if the current rough estimate of pending changes,
844 827 * plus the rough estimate of this transaction's changes, may exceed the
845 828 * allowed usage, then this will fail with ERESTART, which will cause the
846 829 * caller to wait for the pending changes to be written to disk (by waiting
847 830 * for the next TXG to open), and then check the space usage again.
848 831 *
849 832 * The rough estimate of pending changes is comprised of the sum of:
850 833 *
851 834 * - this transaction's holds' txh_space_towrite
852 835 *
853 836 * - dd_tempreserved[], which is the sum of in-flight transactions'
854 837 * holds' txh_space_towrite (i.e. those transactions that have called
855 838 * dmu_tx_assign() but not yet called dmu_tx_commit()).
856 839 *
857 840 * - dd_space_towrite[], which is the amount of dirtied dbufs.
858 841 *
859 842 * Note that all of these values are inflated by spa_get_worst_case_asize(),
860 843 * which means that we may get ERESTART well before we are actually in danger
861 844 * of running out of space, but this also mitigates any small inaccuracies
862 845 * in the rough estimate (e.g. txh_space_towrite doesn't take into account
863 846 * indirect blocks, and dd_space_towrite[] doesn't take into account changes
864 847 * to the MOS).
865 848 *
866 849 * Note that due to this algorithm, it is possible to exceed the allowed
867 850 * usage by one transaction. Also, as we approach the allowed usage,
868 851 * we will allow a very limited amount of changes into each TXG, thus
869 852 * decreasing performance.
870 853 */
871 854 static int
872 -dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how)
855 +dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
873 856 {
874 857 spa_t *spa = tx->tx_pool->dp_spa;
875 858
876 859 ASSERT0(tx->tx_txg);
877 860
878 861 if (tx->tx_err)
879 862 return (tx->tx_err);
880 863
881 864 if (spa_suspended(spa)) {
882 865 /*
883 866 * If the user has indicated a blocking failure mode
884 867 * then return ERESTART which will block in dmu_tx_wait().
885 868 * Otherwise, return EIO so that an error can get
886 869 * propagated back to the VOP calls.
887 870 *
888 871 * Note that we always honor the txg_how flag regardless
889 872 * of the failuremode setting.
890 873 */
891 874 if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
892 - !(txg_how & TXG_WAIT))
875 + txg_how != TXG_WAIT)
893 876 return (SET_ERROR(EIO));
894 877
895 878 return (SET_ERROR(ERESTART));
896 879 }
897 880
898 - if (!tx->tx_dirty_delayed &&
881 + if (!tx->tx_waited &&
899 882 dsl_pool_need_dirty_delay(tx->tx_pool)) {
900 883 tx->tx_wait_dirty = B_TRUE;
901 884 return (SET_ERROR(ERESTART));
902 885 }
903 886
904 887 tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
905 888 tx->tx_needassign_txh = NULL;
906 889
907 890 /*
908 891 * NB: No error returns are allowed after txg_hold_open, but
909 892 * before processing the dnode holds, due to the
910 893 * dmu_tx_unassign() logic.
911 894 */
912 895
913 896 uint64_t towrite = 0;
914 897 uint64_t tohold = 0;
915 898 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
916 899 txh = list_next(&tx->tx_holds, txh)) {
917 900 dnode_t *dn = txh->txh_dnode;
918 901 if (dn != NULL) {
919 902 mutex_enter(&dn->dn_mtx);
920 903 if (dn->dn_assigned_txg == tx->tx_txg - 1) {
921 904 mutex_exit(&dn->dn_mtx);
922 905 tx->tx_needassign_txh = txh;
923 906 return (SET_ERROR(ERESTART));
924 907 }
925 908 if (dn->dn_assigned_txg == 0)
926 909 dn->dn_assigned_txg = tx->tx_txg;
927 910 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
928 911 (void) refcount_add(&dn->dn_tx_holds, tx);
929 912 mutex_exit(&dn->dn_mtx);
930 913 }
931 914 towrite += refcount_count(&txh->txh_space_towrite);
932 915 tohold += refcount_count(&txh->txh_memory_tohold);
933 916 }
934 917
935 918 /* needed allocation: worst-case estimate of write space */
936 919 uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
937 920 /* calculate memory footprint estimate */
938 921 uint64_t memory = towrite + tohold;
939 922
940 923 if (tx->tx_dir != NULL && asize != 0) {
941 924 int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
942 925 asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
943 926 if (err != 0)
944 927 return (err);
945 928 }
946 929
947 930 return (0);
948 931 }
949 932
950 933 static void
951 934 dmu_tx_unassign(dmu_tx_t *tx)
952 935 {
953 936 if (tx->tx_txg == 0)
954 937 return;
955 938
956 939 txg_rele_to_quiesce(&tx->tx_txgh);
957 940
958 941 /*
959 942 * Walk the transaction's hold list, removing the hold on the
960 943 * associated dnode, and notifying waiters if the refcount drops to 0.
961 944 */
962 945 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
963 946 txh != tx->tx_needassign_txh;
964 947 txh = list_next(&tx->tx_holds, txh)) {
965 948 dnode_t *dn = txh->txh_dnode;
966 949
967 950 if (dn == NULL)
968 951 continue;
969 952 mutex_enter(&dn->dn_mtx);
970 953 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
971 954
972 955 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
973 956 dn->dn_assigned_txg = 0;
974 957 cv_broadcast(&dn->dn_notxholds);
975 958 }
976 959 mutex_exit(&dn->dn_mtx);
977 960 }
978 961
979 962 txg_rele_to_sync(&tx->tx_txgh);
980 963
981 964 tx->tx_lasttried_txg = tx->tx_txg;
982 965 tx->tx_txg = 0;
983 966 }
984 967
985 968 /*
986 - * Assign tx to a transaction group; txg_how is a bitmask:
969 + * Assign tx to a transaction group. txg_how can be one of:
987 970 *
988 - * If TXG_WAIT is set and the currently open txg is full, this function
989 - * will wait until there's a new txg. This should be used when no locks
990 - * are being held. With this bit set, this function will only fail if
991 - * we're truly out of space (or over quota).
971 + * (1) TXG_WAIT. If the current open txg is full, waits until there's
972 + * a new one. This should be used when you're not holding locks.
973 + * It will only fail if we're truly out of space (or over quota).
992 974 *
993 - * If TXG_WAIT is *not* set and we can't assign into the currently open
994 - * txg without blocking, this function will return immediately with
995 - * ERESTART. This should be used whenever locks are being held. On an
996 - * ERESTART error, the caller should drop all locks, call dmu_tx_wait(),
997 - * and try again.
975 + * (2) TXG_NOWAIT. If we can't assign into the current open txg without
976 + * blocking, returns immediately with ERESTART. This should be used
977 + * whenever you're holding locks. On an ERESTART error, the caller
978 + * should drop locks, do a dmu_tx_wait(tx), and try again.
998 979 *
999 - * If TXG_NOTHROTTLE is set, this indicates that this tx should not be
1000 - * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for
1001 - * details on the throttle). This is used by the VFS operations, after
1002 - * they have already called dmu_tx_wait() (though most likely on a
1003 - * different tx).
980 + * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait()
981 + * has already been called on behalf of this operation (though
982 + * most likely on a different tx).
1004 983 */
1005 984 int
1006 -dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how)
985 +dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
1007 986 {
1008 987 int err;
1009 988
1010 989 ASSERT(tx->tx_txg == 0);
1011 - ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE));
990 + ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
991 + txg_how == TXG_WAITED);
1012 992 ASSERT(!dsl_pool_sync_context(tx->tx_pool));
1013 993
1014 994 /* If we might wait, we must not hold the config lock. */
1015 - IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
995 + ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));
1016 996
1017 - if ((txg_how & TXG_NOTHROTTLE))
1018 - tx->tx_dirty_delayed = B_TRUE;
997 + if (txg_how == TXG_WAITED)
998 + tx->tx_waited = B_TRUE;
1019 999
1020 1000 while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
1021 1001 dmu_tx_unassign(tx);
1022 1002
1023 - if (err != ERESTART || !(txg_how & TXG_WAIT))
1003 + if (err != ERESTART || txg_how != TXG_WAIT)
1024 1004 return (err);
1025 1005
1026 1006 dmu_tx_wait(tx);
1027 1007 }
1028 1008
1029 1009 txg_rele_to_quiesce(&tx->tx_txgh);
1030 1010
1031 1011 return (0);
1032 1012 }
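
The retry loop implied by case (2) above typically looks like the following caller-side sketch (not code from this patch; `waited`, `object`, `off` and `len` are hypothetical locals):

	boolean_t waited = B_FALSE;
top:
	tx = dmu_tx_create(os);
	dmu_tx_hold_write(tx, object, off, len);
	err = dmu_tx_assign(tx, waited ? TXG_WAITED : TXG_NOWAIT);
	if (err != 0) {
		if (err == ERESTART) {
			waited = B_TRUE;
			dmu_tx_wait(tx);
			dmu_tx_abort(tx);
			goto top;
		}
		dmu_tx_abort(tx);
		return (err);
	}
	/* ... dirty buffers under this tx ... */
	dmu_tx_commit(tx);
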
1033 1013
1034 1014 void
1035 1015 dmu_tx_wait(dmu_tx_t *tx)
1036 1016 {
1037 1017 spa_t *spa = tx->tx_pool->dp_spa;
1038 1018 dsl_pool_t *dp = tx->tx_pool;
1039 1019
1040 1020 ASSERT(tx->tx_txg == 0);
1041 1021 ASSERT(!dsl_pool_config_held(tx->tx_pool));
1042 1022
1043 1023 if (tx->tx_wait_dirty) {
1044 1024 /*
1045 1025 * dmu_tx_try_assign() has determined that we need to wait
1046 1026 * because we've consumed much or all of the dirty buffer
1047 1027 * space.
1048 1028 */
1049 1029 mutex_enter(&dp->dp_lock);
1050 1030 while (dp->dp_dirty_total >= zfs_dirty_data_max)
1051 1031 cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
1052 1032 uint64_t dirty = dp->dp_dirty_total;
1053 1033 mutex_exit(&dp->dp_lock);
1054 1034
1055 1035 dmu_tx_delay(tx, dirty);
1056 1036
1057 1037 tx->tx_wait_dirty = B_FALSE;
1058 1038
1059 1039 /*
1060 - * Note: setting tx_dirty_delayed only has effect if the
1061 - * caller used TX_WAIT. Otherwise they are going to
1062 - * destroy this tx and try again. The common case,
1063 - * zfs_write(), uses TX_WAIT.
1040 + * Note: setting tx_waited only has effect if the caller
1041 + * used TX_WAIT. Otherwise they are going to destroy
1042 + * this tx and try again. The common case, zfs_write(),
1043 + * uses TX_WAIT.
1064 1044 */
1065 - tx->tx_dirty_delayed = B_TRUE;
1045 + tx->tx_waited = B_TRUE;
1066 1046 } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
1067 1047 /*
1068 1048 * If the pool is suspended we need to wait until it
1069 1049 * is resumed. Note that it's possible that the pool
1070 1050 * has become active after this thread has tried to
1071 1051 * obtain a tx. If that's the case then tx_lasttried_txg
1072 1052 * would not have been set.
1073 1053 */
1074 1054 txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1075 1055 } else if (tx->tx_needassign_txh) {
1076 1056 /*
1077 1057 * A dnode is assigned to the quiescing txg. Wait for its
1078 1058 * transaction to complete.
1079 1059 */
1080 1060 dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
1081 1061
1082 1062 mutex_enter(&dn->dn_mtx);
1083 1063 while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
1084 1064 cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
1085 1065 mutex_exit(&dn->dn_mtx);
1086 1066 tx->tx_needassign_txh = NULL;
1087 1067 } else {
1088 - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
1068 + /*
1069 + * If we have a lot of dirty data just wait until we sync
1070 + * out a TXG at which point we'll hopefully have synced
1071 + * a portion of the changes.
1072 + */
1073 + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
1089 1074 }
1090 1075 }
1091 1076
1092 1077 static void
1093 1078 dmu_tx_destroy(dmu_tx_t *tx)
1094 1079 {
1095 1080 dmu_tx_hold_t *txh;
1096 1081
1097 1082 while ((txh = list_head(&tx->tx_holds)) != NULL) {
1098 1083 dnode_t *dn = txh->txh_dnode;
1099 1084
1100 1085 list_remove(&tx->tx_holds, txh);
1101 1086 refcount_destroy_many(&txh->txh_space_towrite,
1102 1087 refcount_count(&txh->txh_space_towrite));
1103 1088 refcount_destroy_many(&txh->txh_memory_tohold,
1104 1089 refcount_count(&txh->txh_memory_tohold));
1105 1090 kmem_free(txh, sizeof (dmu_tx_hold_t));
1106 1091 if (dn != NULL)
1107 1092 dnode_rele(dn, tx);
1108 1093 }
1109 1094
1110 1095 list_destroy(&tx->tx_callbacks);
1111 1096 list_destroy(&tx->tx_holds);
1112 1097 kmem_free(tx, sizeof (dmu_tx_t));
1113 1098 }
1114 1099
1115 1100 void
1116 1101 dmu_tx_commit(dmu_tx_t *tx)
1117 1102 {
1118 1103 ASSERT(tx->tx_txg != 0);
1119 1104
1120 1105 /*
1121 1106 * Go through the transaction's hold list and remove holds on
1122 1107 * associated dnodes, notifying waiters if no holds remain.
1123 1108 */
1124 1109 for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
1125 1110 txh = list_next(&tx->tx_holds, txh)) {
1126 1111 dnode_t *dn = txh->txh_dnode;
1127 1112
1128 1113 if (dn == NULL)
1129 1114 continue;
1130 1115
1131 1116 mutex_enter(&dn->dn_mtx);
1132 1117 ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
1133 1118
1134 1119 if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
1135 1120 dn->dn_assigned_txg = 0;
1136 1121 cv_broadcast(&dn->dn_notxholds);
1137 1122 }
1138 1123 mutex_exit(&dn->dn_mtx);
1139 1124 }
1140 1125
1141 1126 if (tx->tx_tempreserve_cookie)
1142 1127 dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);
1143 1128
1144 - if (!list_is_empty(&tx->tx_callbacks))
1145 - txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);
1129 + if (!list_is_empty(&tx->tx_callbacks)) {
1130 + if (dmu_tx_is_syncing(tx)) {
1131 + txg_register_callbacks_sync(tx->tx_pool,
1132 + tx->tx_txg, &tx->tx_callbacks);
1133 + } else {
1134 + txg_register_callbacks(&tx->tx_txgh,
1135 + &tx->tx_callbacks);
1136 + }
1137 + }
1146 1138
1147 1139 if (tx->tx_anyobj == FALSE)
1148 1140 txg_rele_to_sync(&tx->tx_txgh);
1149 1141
1150 1142 dmu_tx_destroy(tx);
1151 1143 }
1152 1144
1153 1145 void
1154 1146 dmu_tx_abort(dmu_tx_t *tx)
1155 1147 {
1156 1148 ASSERT(tx->tx_txg == 0);
1157 1149
1158 1150 /*
1159 1151 * Call any registered callbacks with an error code.
1160 1152 */
1161 1153 if (!list_is_empty(&tx->tx_callbacks))
1162 1154 dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);
1163 1155
1164 1156 dmu_tx_destroy(tx);
1165 1157 }
1166 1158
1167 1159 uint64_t
1168 1160 dmu_tx_get_txg(dmu_tx_t *tx)
1169 1161 {
1170 1162 ASSERT(tx->tx_txg != 0);
1171 1163 return (tx->tx_txg);
1172 1164 }
1173 1165
1174 1166 dsl_pool_t *
1175 1167 dmu_tx_pool(dmu_tx_t *tx)
1176 1168 {
1177 1169 ASSERT(tx->tx_pool != NULL);
1178 1170 return (tx->tx_pool);
1179 1171 }
1180 1172
1181 1173 void
1182 1174 dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
1183 1175 {
1184 1176 dmu_tx_callback_t *dcb;
1185 1177
1186 1178 dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);
1187 1179
1188 1180 dcb->dcb_func = func;
1189 1181 dcb->dcb_data = data;
1190 1182
1191 1183 list_insert_tail(&tx->tx_callbacks, dcb);
1192 1184 }
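
For context, a sketch of how a commit callback is registered; my_commit_cb is a hypothetical function, not part of this patch. The callback is invoked with error == 0 once the assigned txg has synced, or with ECANCELED from dmu_tx_abort() above:

	static void
	my_commit_cb(void *arg, int error)
	{
		if (error == 0)
			cmn_err(CE_NOTE, "commit callback: changes on stable storage");
	}

	/* after dmu_tx_assign() succeeds: */
	dmu_tx_callback_register(tx, my_commit_cb, NULL);
	dmu_tx_commit(tx);
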
1193 1185
1194 1186 /*
1195 1187 * Call all the commit callbacks on a list, with a given error code.
1196 1188 */
1197 1189 void
1198 1190 dmu_tx_do_callbacks(list_t *cb_list, int error)
1199 1191 {
1200 1192 dmu_tx_callback_t *dcb;
1201 1193
1202 1194 while ((dcb = list_head(cb_list)) != NULL) {
1203 1195 list_remove(cb_list, dcb);
1204 1196 dcb->dcb_func(dcb->dcb_data, error);
1205 1197 kmem_free(dcb, sizeof (dmu_tx_callback_t));
1206 1198 }
1207 1199 }
1208 1200
1209 1201 /*
1210 1202 * Interface to hold a bunch of attributes.
1211 1203 * used for creating new files.
1212 1204 * attrsize is the total size of all attributes
1213 1205 * to be added during object creation
1214 1206 *
1215 1207 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
1216 1208 */
1217 1209
1218 1210 /*
1219 1211 * hold necessary attribute name for attribute registration.
1220 1212 * should be a very rare case where this is needed. If it does
1221 1213 * happen it would only happen on the first write to the file system.
1222 1214 */
1223 1215 static void
1224 1216 dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
1225 1217 {
1226 1218 if (!sa->sa_need_attr_registration)
1227 1219 return;
1228 1220
1229 1221 for (int i = 0; i != sa->sa_num_attrs; i++) {
1230 1222 if (!sa->sa_attr_table[i].sa_registered) {
1231 1223 if (sa->sa_reg_attr_obj)
1232 1224 dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
1233 1225 B_TRUE, sa->sa_attr_table[i].sa_name);
1234 1226 else
1235 1227 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
1236 1228 B_TRUE, sa->sa_attr_table[i].sa_name);
1237 1229 }
1238 1230 }
1239 1231 }
1240 1232
1241 1233 void
1242 1234 dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
1243 1235 {
1244 1236 dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx,
1245 1237 tx->tx_objset, object, THT_SPILL, 0, 0);
1246 1238
1247 1239 (void) refcount_add_many(&txh->txh_space_towrite,
1248 1240 SPA_OLD_MAXBLOCKSIZE, FTAG);
1249 1241 }
1250 1242
1251 1243 void
1252 1244 dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
1253 1245 {
1254 1246 sa_os_t *sa = tx->tx_objset->os_sa;
1255 1247
1256 1248 dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
1257 1249
1258 1250 if (tx->tx_objset->os_sa->sa_master_obj == 0)
1259 1251 return;
1260 1252
1261 1253 if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
1262 1254 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1263 1255 } else {
1264 1256 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1265 1257 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1266 1258 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1267 1259 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1268 1260 }
1269 1261
1270 1262 dmu_tx_sa_registration_hold(sa, tx);
1271 1263
1272 1264 if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
1273 1265 return;
1274 1266
1275 1267 (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
1276 1268 THT_SPILL, 0, 0);
1277 1269 }
1278 1270
1279 1271 /*
1280 1272 * Hold SA attribute
1281 1273 *
1282 1274 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
1283 1275 *
1284 1276 * variable_size is the total size of all variable sized attributes
1285 1277 * passed to this function. It is not the total size of all
1286 1278 * variable size attributes that *may* exist on this object.
1287 1279 */
1288 1280 void
1289 1281 dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
1290 1282 {
1291 1283 uint64_t object;
1292 1284 sa_os_t *sa = tx->tx_objset->os_sa;
1293 1285
1294 1286 ASSERT(hdl != NULL);
1295 1287
1296 1288 object = sa_handle_object(hdl);
1297 1289
1298 1290 dmu_tx_hold_bonus(tx, object);
1299 1291
1300 1292 if (tx->tx_objset->os_sa->sa_master_obj == 0)
1301 1293 return;
1302 1294
1303 1295 if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
1304 1296 tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
1305 1297 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
1306 1298 dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
1307 1299 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1308 1300 dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
1309 1301 }
1310 1302
1311 1303 dmu_tx_sa_registration_hold(sa, tx);
1312 1304
1313 1305 if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
1314 1306 dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
1315 1307
1316 1308 if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
1317 1309 ASSERT(tx->tx_txg == 0);
1318 1310 dmu_tx_hold_spill(tx, object);
1319 1311 } else {
1320 1312 dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
1321 1313 dnode_t *dn;
1322 1314
1323 1315 DB_DNODE_ENTER(db);
1324 1316 dn = DB_DNODE(db);
1325 1317 if (dn->dn_have_spill) {
1326 1318 ASSERT(tx->tx_txg == 0);
1327 1319 dmu_tx_hold_spill(tx, object);
1328 1320 }
1329 1321 DB_DNODE_EXIT(db);
1330 1322 }
1331 1323 }
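
To round out the interface comment above, a rough sketch of the single-attribute update path this hold supports (attr_id and value are placeholders, and the code is illustrative rather than taken from the patch):

	tx = dmu_tx_create(os);
	dmu_tx_hold_sa(tx, hdl, B_FALSE);	/* not growing the SA */
	if ((err = dmu_tx_assign(tx, TXG_WAIT)) != 0) {
		dmu_tx_abort(tx);
		return (err);
	}
	(void) sa_update(hdl, attr_id, &value, sizeof (value), tx);
	dmu_tx_commit(tx);
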