2619 asynchronous destruction of ZFS file systems
2747 SPA versioning with zfs feature flags
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <gwilson@delphix.com>
Reviewed by: Richard Lowe <richlowe@richlowe.net>
Reviewed by: Dan Kruchinin <dan.kruchinin@gmail.com>
Approved by: Dan McDonald <danmcd@nexenta.com>
--- old/usr/src/uts/common/fs/zfs/dmu.c
+++ new/usr/src/uts/common/fs/zfs/dmu.c
1 1 /*
2 2 * CDDL HEADER START
3 3 *
4 4 * The contents of this file are subject to the terms of the
5 5 * Common Development and Distribution License (the "License").
6 6 * You may not use this file except in compliance with the License.
7 7 *
8 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 9 * or http://www.opensolaris.org/os/licensing.
10 10 * See the License for the specific language governing permissions
11 11 * and limitations under the License.
12 12 *
13 13 * When distributing Covered Code, include this CDDL HEADER in each
14 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 15 * If applicable, add the following below this CDDL HEADER, with the
16 16 * fields enclosed by brackets "[]" replaced with your own identifying
17 17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 18 *
19 19 * CDDL HEADER END
20 20 */
21 21 /*
22 22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 + * Copyright (c) 2012 by Delphix. All rights reserved.
23 24 */
24 25
25 26 #include <sys/dmu.h>
26 27 #include <sys/dmu_impl.h>
27 28 #include <sys/dmu_tx.h>
28 29 #include <sys/dbuf.h>
29 30 #include <sys/dnode.h>
30 31 #include <sys/zfs_context.h>
31 32 #include <sys/dmu_objset.h>
32 33 #include <sys/dmu_traverse.h>
33 34 #include <sys/dsl_dataset.h>
34 35 #include <sys/dsl_dir.h>
35 36 #include <sys/dsl_pool.h>
36 37 #include <sys/dsl_synctask.h>
37 38 #include <sys/dsl_prop.h>
38 39 #include <sys/dmu_zfetch.h>
39 40 #include <sys/zfs_ioctl.h>
40 41 #include <sys/zap.h>
41 42 #include <sys/zio_checksum.h>
42 43 #include <sys/sa.h>
43 44 #ifdef _KERNEL
44 45 #include <sys/vmsystm.h>
45 46 #include <sys/zfs_znode.h>
46 47 #endif
47 48
48 49 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
49 - { byteswap_uint8_array, TRUE, "unallocated" },
50 - { zap_byteswap, TRUE, "object directory" },
51 - { byteswap_uint64_array, TRUE, "object array" },
52 - { byteswap_uint8_array, TRUE, "packed nvlist" },
53 - { byteswap_uint64_array, TRUE, "packed nvlist size" },
54 - { byteswap_uint64_array, TRUE, "bpobj" },
55 - { byteswap_uint64_array, TRUE, "bpobj header" },
56 - { byteswap_uint64_array, TRUE, "SPA space map header" },
57 - { byteswap_uint64_array, TRUE, "SPA space map" },
58 - { byteswap_uint64_array, TRUE, "ZIL intent log" },
59 - { dnode_buf_byteswap, TRUE, "DMU dnode" },
60 - { dmu_objset_byteswap, TRUE, "DMU objset" },
61 - { byteswap_uint64_array, TRUE, "DSL directory" },
62 - { zap_byteswap, TRUE, "DSL directory child map"},
63 - { zap_byteswap, TRUE, "DSL dataset snap map" },
64 - { zap_byteswap, TRUE, "DSL props" },
65 - { byteswap_uint64_array, TRUE, "DSL dataset" },
66 - { zfs_znode_byteswap, TRUE, "ZFS znode" },
67 - { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" },
68 - { byteswap_uint8_array, FALSE, "ZFS plain file" },
69 - { zap_byteswap, TRUE, "ZFS directory" },
70 - { zap_byteswap, TRUE, "ZFS master node" },
71 - { zap_byteswap, TRUE, "ZFS delete queue" },
72 - { byteswap_uint8_array, FALSE, "zvol object" },
73 - { zap_byteswap, TRUE, "zvol prop" },
74 - { byteswap_uint8_array, FALSE, "other uint8[]" },
75 - { byteswap_uint64_array, FALSE, "other uint64[]" },
76 - { zap_byteswap, TRUE, "other ZAP" },
77 - { zap_byteswap, TRUE, "persistent error log" },
78 - { byteswap_uint8_array, TRUE, "SPA history" },
79 - { byteswap_uint64_array, TRUE, "SPA history offsets" },
80 - { zap_byteswap, TRUE, "Pool properties" },
81 - { zap_byteswap, TRUE, "DSL permissions" },
82 - { zfs_acl_byteswap, TRUE, "ZFS ACL" },
83 - { byteswap_uint8_array, TRUE, "ZFS SYSACL" },
84 - { byteswap_uint8_array, TRUE, "FUID table" },
85 - { byteswap_uint64_array, TRUE, "FUID table size" },
86 - { zap_byteswap, TRUE, "DSL dataset next clones"},
87 - { zap_byteswap, TRUE, "scan work queue" },
88 - { zap_byteswap, TRUE, "ZFS user/group used" },
89 - { zap_byteswap, TRUE, "ZFS user/group quota" },
90 - { zap_byteswap, TRUE, "snapshot refcount tags"},
91 - { zap_byteswap, TRUE, "DDT ZAP algorithm" },
92 - { zap_byteswap, TRUE, "DDT statistics" },
93 - { byteswap_uint8_array, TRUE, "System attributes" },
94 - { zap_byteswap, TRUE, "SA master node" },
95 - { zap_byteswap, TRUE, "SA attr registration" },
96 - { zap_byteswap, TRUE, "SA attr layouts" },
97 - { zap_byteswap, TRUE, "scan translations" },
98 - { byteswap_uint8_array, FALSE, "deduplicated block" },
99 - { zap_byteswap, TRUE, "DSL deadlist map" },
100 - { byteswap_uint64_array, TRUE, "DSL deadlist map hdr" },
101 - { zap_byteswap, TRUE, "DSL dir clones" },
102 - { byteswap_uint64_array, TRUE, "bpobj subobj" },
50 + { DMU_BSWAP_UINT8, TRUE, "unallocated" },
51 + { DMU_BSWAP_ZAP, TRUE, "object directory" },
52 + { DMU_BSWAP_UINT64, TRUE, "object array" },
53 + { DMU_BSWAP_UINT8, TRUE, "packed nvlist" },
54 + { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" },
55 + { DMU_BSWAP_UINT64, TRUE, "bpobj" },
56 + { DMU_BSWAP_UINT64, TRUE, "bpobj header" },
57 + { DMU_BSWAP_UINT64, TRUE, "SPA space map header" },
58 + { DMU_BSWAP_UINT64, TRUE, "SPA space map" },
59 + { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" },
60 + { DMU_BSWAP_DNODE, TRUE, "DMU dnode" },
61 + { DMU_BSWAP_OBJSET, TRUE, "DMU objset" },
62 + { DMU_BSWAP_UINT64, TRUE, "DSL directory" },
63 + { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"},
64 + { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" },
65 + { DMU_BSWAP_ZAP, TRUE, "DSL props" },
66 + { DMU_BSWAP_UINT64, TRUE, "DSL dataset" },
67 + { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" },
68 + { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" },
69 + { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" },
70 + { DMU_BSWAP_ZAP, TRUE, "ZFS directory" },
71 + { DMU_BSWAP_ZAP, TRUE, "ZFS master node" },
72 + { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" },
73 + { DMU_BSWAP_UINT8, FALSE, "zvol object" },
74 + { DMU_BSWAP_ZAP, TRUE, "zvol prop" },
75 + { DMU_BSWAP_UINT8, FALSE, "other uint8[]" },
76 + { DMU_BSWAP_UINT64, FALSE, "other uint64[]" },
77 + { DMU_BSWAP_ZAP, TRUE, "other ZAP" },
78 + { DMU_BSWAP_ZAP, TRUE, "persistent error log" },
79 + { DMU_BSWAP_UINT8, TRUE, "SPA history" },
80 + { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" },
81 + { DMU_BSWAP_ZAP, TRUE, "Pool properties" },
82 + { DMU_BSWAP_ZAP, TRUE, "DSL permissions" },
83 + { DMU_BSWAP_ACL, TRUE, "ZFS ACL" },
84 + { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" },
85 + { DMU_BSWAP_UINT8, TRUE, "FUID table" },
86 + { DMU_BSWAP_UINT64, TRUE, "FUID table size" },
87 + { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"},
88 + { DMU_BSWAP_ZAP, TRUE, "scan work queue" },
89 + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" },
90 + { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" },
91 + { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"},
92 + { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" },
93 + { DMU_BSWAP_ZAP, TRUE, "DDT statistics" },
94 + { DMU_BSWAP_UINT8, TRUE, "System attributes" },
95 + { DMU_BSWAP_ZAP, TRUE, "SA master node" },
96 + { DMU_BSWAP_ZAP, TRUE, "SA attr registration" },
97 + { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" },
98 + { DMU_BSWAP_ZAP, TRUE, "scan translations" },
99 + { DMU_BSWAP_UINT8, FALSE, "deduplicated block" },
100 + { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" },
101 + { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" },
102 + { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" },
103 + { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" }
103 104 };
104 105
106 +const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
107 + { byteswap_uint8_array, "uint8" },
108 + { byteswap_uint16_array, "uint16" },
109 + { byteswap_uint32_array, "uint32" },
110 + { byteswap_uint64_array, "uint64" },
111 + { zap_byteswap, "zap" },
112 + { dnode_buf_byteswap, "dnode" },
113 + { dmu_objset_byteswap, "objset" },
114 + { zfs_znode_byteswap, "znode" },
115 + { zfs_oldacl_byteswap, "oldacl" },
116 + { zfs_acl_byteswap, "acl" }
117 +};
118 +
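
The hunk above splits the old single table in two: dmu_ot[] now carries only a
byteswap class (DMU_BSWAP_*), a metadata flag, and a name, while the new
dmu_ot_byteswap[] maps each class to its swap routine. A minimal sketch of the
declarations this layout assumes is below; the real definitions live in
sys/dmu.h, which is not part of this hunk, so treat the exact names and layout
as illustrative.

	typedef enum dmu_object_byteswap {
		DMU_BSWAP_UINT8,
		DMU_BSWAP_UINT16,
		DMU_BSWAP_UINT32,
		DMU_BSWAP_UINT64,
		DMU_BSWAP_ZAP,
		DMU_BSWAP_DNODE,
		DMU_BSWAP_OBJSET,
		DMU_BSWAP_ZNODE,
		DMU_BSWAP_OLDACL,
		DMU_BSWAP_ACL,
		DMU_BSWAP_NUMFUNCS		/* must stay last */
	} dmu_object_byteswap_t;

	typedef struct dmu_object_type_info {
		dmu_object_byteswap_t	ot_byteswap;	/* index into dmu_ot_byteswap[] */
		boolean_t		ot_metadata;
		char			*ot_name;
	} dmu_object_type_info_t;

	typedef struct dmu_object_byteswap_info {
		void			(*ob_func)(void *buf, size_t size);
		char			*ob_name;
	} dmu_object_byteswap_info_t;

Keying the per-type table by byteswap class rather than by function pointer is
presumably what lets feature-flag object types describe their own byteswap
behavior without growing dmu_ot[].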
105 119 int
106 120 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
107 121 void *tag, dmu_buf_t **dbp, int flags)
108 122 {
109 123 dnode_t *dn;
110 124 uint64_t blkid;
111 125 dmu_buf_impl_t *db;
112 126 int err;
113 127 int db_flags = DB_RF_CANFAIL;
114 128
115 129 if (flags & DMU_READ_NO_PREFETCH)
116 130 db_flags |= DB_RF_NOPREFETCH;
117 131
118 132 err = dnode_hold(os, object, FTAG, &dn);
119 133 if (err)
120 134 return (err);
121 135 blkid = dbuf_whichblock(dn, offset);
122 136 rw_enter(&dn->dn_struct_rwlock, RW_READER);
123 137 db = dbuf_hold(dn, blkid, tag);
124 138 rw_exit(&dn->dn_struct_rwlock);
125 139 if (db == NULL) {
126 140 err = EIO;
127 141 } else {
128 142 err = dbuf_read(db, NULL, db_flags);
129 143 if (err) {
130 144 dbuf_rele(db, tag);
131 145 db = NULL;
132 146 }
133 147 }
134 148
135 149 dnode_rele(dn, FTAG);
136 150 *dbp = &db->db; /* NULL db plus first field offset is NULL */
137 151 return (err);
138 152 }
139 153
140 154 int
141 155 dmu_bonus_max(void)
142 156 {
143 157 return (DN_MAX_BONUSLEN);
144 158 }
145 159
146 160 int
147 161 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
148 162 {
149 163 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
150 164 dnode_t *dn;
151 165 int error;
152 166
153 167 DB_DNODE_ENTER(db);
154 168 dn = DB_DNODE(db);
155 169
156 170 if (dn->dn_bonus != db) {
157 171 error = EINVAL;
158 172 } else if (newsize < 0 || newsize > db_fake->db_size) {
159 173 error = EINVAL;
160 174 } else {
161 175 dnode_setbonuslen(dn, newsize, tx);
162 176 error = 0;
163 177 }
164 178
165 179 DB_DNODE_EXIT(db);
166 180 return (error);
167 181 }
168 182
169 183 int
170 184 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
171 185 {
172 186 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
173 187 dnode_t *dn;
174 188 int error;
175 189
176 190 DB_DNODE_ENTER(db);
177 191 dn = DB_DNODE(db);
178 192
179 - if (type > DMU_OT_NUMTYPES) {
193 + if (!DMU_OT_IS_VALID(type)) {
180 194 error = EINVAL;
181 195 } else if (dn->dn_bonus != db) {
182 196 error = EINVAL;
183 197 } else {
184 198 dnode_setbonus_type(dn, type, tx);
185 199 error = 0;
186 200 }
187 201
188 202 DB_DNODE_EXIT(db);
189 203 return (error);
190 204 }
191 205
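
The old test (type > DMU_OT_NUMTYPES) only made sense while object types were a
dense range, and it also let the out-of-range value DMU_OT_NUMTYPES itself slip
through. With feature flags, types outside the legacy range can encode their
byteswap class and metadata bit in the type value itself, so validity is now
asked through DMU_OT_IS_VALID(). The macros are defined in sys/dmu.h, not in
this file; one plausible shape, assuming a high bit marks the new encoding, is:

	#define	DMU_OT_NEWTYPE		0x80
	#define	DMU_OT_METADATA		0x40
	#define	DMU_OT_BYTESWAP_MASK	0x3f

	/* legacy types must be below DMU_OT_NUMTYPES; new-style types must
	 * name a byteswap class that dmu_ot_byteswap[] actually has */
	#define	DMU_OT_IS_VALID(ot)	(((ot) & DMU_OT_NEWTYPE) ? \
		((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
		(ot) < DMU_OT_NUMTYPES)

	/* new-style types carry their own metadata bit; legacy types
	 * still consult dmu_ot[] */
	#define	DMU_OT_IS_METADATA(ot)	(((ot) & DMU_OT_NEWTYPE) ? \
		((ot) & DMU_OT_METADATA) : \
		dmu_ot[(ot)].ot_metadata)

DMU_OT_IS_METADATA() is the same idea applied to the ot_metadata lookup that
dmu_write_policy() used to do directly (see the hunk further down).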
192 206 dmu_object_type_t
193 207 dmu_get_bonustype(dmu_buf_t *db_fake)
194 208 {
195 209 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
196 210 dnode_t *dn;
197 211 dmu_object_type_t type;
198 212
199 213 DB_DNODE_ENTER(db);
200 214 dn = DB_DNODE(db);
201 215 type = dn->dn_bonustype;
202 216 DB_DNODE_EXIT(db);
203 217
204 218 return (type);
205 219 }
206 220
207 221 int
208 222 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
209 223 {
210 224 dnode_t *dn;
211 225 int error;
212 226
213 227 error = dnode_hold(os, object, FTAG, &dn);
214 228 dbuf_rm_spill(dn, tx);
215 229 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
216 230 dnode_rm_spill(dn, tx);
217 231 rw_exit(&dn->dn_struct_rwlock);
218 232 dnode_rele(dn, FTAG);
219 233 return (error);
220 234 }
221 235
222 236 /*
223 237 * returns ENOENT, EIO, or 0.
224 238 */
225 239 int
226 240 dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
227 241 {
228 242 dnode_t *dn;
229 243 dmu_buf_impl_t *db;
230 244 int error;
231 245
232 246 error = dnode_hold(os, object, FTAG, &dn);
233 247 if (error)
234 248 return (error);
235 249
236 250 rw_enter(&dn->dn_struct_rwlock, RW_READER);
237 251 if (dn->dn_bonus == NULL) {
238 252 rw_exit(&dn->dn_struct_rwlock);
239 253 rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
240 254 if (dn->dn_bonus == NULL)
241 255 dbuf_create_bonus(dn);
242 256 }
243 257 db = dn->dn_bonus;
244 258
245 259 /* as long as the bonus buf is held, the dnode will be held */
246 260 if (refcount_add(&db->db_holds, tag) == 1) {
247 261 VERIFY(dnode_add_ref(dn, db));
248 262 (void) atomic_inc_32_nv(&dn->dn_dbufs_count);
249 263 }
250 264
251 265 /*
252 266 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
253 267 * hold and incrementing the dbuf count to ensure that dnode_move() sees
254 268 * a dnode hold for every dbuf.
255 269 */
256 270 rw_exit(&dn->dn_struct_rwlock);
257 271
258 272 dnode_rele(dn, FTAG);
259 273
260 274 VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
261 275
262 276 *dbp = &db->db;
263 277 return (0);
264 278 }
265 279
266 280 /*
267 281 * returns ENOENT, EIO, or 0.
268 282 *
269 283 * This interface will allocate a blank spill dbuf when a spill blk
270 284 * doesn't already exist on the dnode.
271 285 *
272 286 * if you only want to find an already existing spill db, then
273 287 * dmu_spill_hold_existing() should be used.
274 288 */
275 289 int
276 290 dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
277 291 {
278 292 dmu_buf_impl_t *db = NULL;
279 293 int err;
280 294
281 295 if ((flags & DB_RF_HAVESTRUCT) == 0)
282 296 rw_enter(&dn->dn_struct_rwlock, RW_READER);
283 297
284 298 db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
285 299
286 300 if ((flags & DB_RF_HAVESTRUCT) == 0)
287 301 rw_exit(&dn->dn_struct_rwlock);
288 302
289 303 ASSERT(db != NULL);
290 304 err = dbuf_read(db, NULL, flags);
291 305 if (err == 0)
292 306 *dbp = &db->db;
293 307 else
294 308 dbuf_rele(db, tag);
295 309 return (err);
296 310 }
297 311
298 312 int
299 313 dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
300 314 {
301 315 dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
302 316 dnode_t *dn;
303 317 int err;
304 318
305 319 DB_DNODE_ENTER(db);
306 320 dn = DB_DNODE(db);
307 321
308 322 if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
309 323 err = EINVAL;
310 324 } else {
311 325 rw_enter(&dn->dn_struct_rwlock, RW_READER);
312 326
313 327 if (!dn->dn_have_spill) {
314 328 err = ENOENT;
315 329 } else {
316 330 err = dmu_spill_hold_by_dnode(dn,
317 331 DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
318 332 }
319 333
320 334 rw_exit(&dn->dn_struct_rwlock);
321 335 }
322 336
323 337 DB_DNODE_EXIT(db);
324 338 return (err);
325 339 }
326 340
327 341 int
328 342 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
329 343 {
330 344 dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
331 345 dnode_t *dn;
332 346 int err;
333 347
334 348 DB_DNODE_ENTER(db);
335 349 dn = DB_DNODE(db);
336 350 err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
337 351 DB_DNODE_EXIT(db);
338 352
339 353 return (err);
340 354 }
341 355
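
To illustrate the comment above dmu_spill_hold_by_dnode(): a caller that only
cares about a spill block that already exists goes through the bonus buffer and
must be prepared for ENOENT. This is a hedged sketch, not code from this
change; os, object, and the error handling are placeholders.

	dmu_buf_t *bonus, *spill = NULL;
	int err;

	err = dmu_bonus_hold(os, object, FTAG, &bonus);
	if (err == 0) {
		err = dmu_spill_hold_existing(bonus, FTAG, &spill);
		if (err == 0) {
			/* ... read attributes out of spill->db_data ... */
			dmu_buf_rele(spill, FTAG);
		} else if (err == ENOENT) {
			/* no spill block; everything fits in the bonus buffer */
		}
		dmu_buf_rele(bonus, FTAG);
	}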
342 356 /*
343 357 * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
344 358 * to take a held dnode rather than <os, object> -- the lookup is wasteful,
345 359 * and can induce severe lock contention when writing to several files
346 360 * whose dnodes are in the same block.
347 361 */
348 362 static int
349 363 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
350 364 int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
351 365 {
352 366 dsl_pool_t *dp = NULL;
353 367 dmu_buf_t **dbp;
354 368 uint64_t blkid, nblks, i;
355 369 uint32_t dbuf_flags;
356 370 int err;
357 371 zio_t *zio;
358 372 hrtime_t start;
359 373
360 374 ASSERT(length <= DMU_MAX_ACCESS);
361 375
362 376 dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT;
363 377 if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz)
364 378 dbuf_flags |= DB_RF_NOPREFETCH;
365 379
366 380 rw_enter(&dn->dn_struct_rwlock, RW_READER);
367 381 if (dn->dn_datablkshift) {
368 382 int blkshift = dn->dn_datablkshift;
369 383 nblks = (P2ROUNDUP(offset+length, 1ULL<<blkshift) -
370 384 P2ALIGN(offset, 1ULL<<blkshift)) >> blkshift;
371 385 } else {
372 386 if (offset + length > dn->dn_datablksz) {
373 387 zfs_panic_recover("zfs: accessing past end of object "
374 388 "%llx/%llx (size=%u access=%llu+%llu)",
375 389 (longlong_t)dn->dn_objset->
376 390 os_dsl_dataset->ds_object,
377 391 (longlong_t)dn->dn_object, dn->dn_datablksz,
378 392 (longlong_t)offset, (longlong_t)length);
379 393 rw_exit(&dn->dn_struct_rwlock);
380 394 return (EIO);
381 395 }
382 396 nblks = 1;
383 397 }
384 398 dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
385 399
386 400 if (dn->dn_objset->os_dsl_dataset)
387 401 dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
388 402 if (dp && dsl_pool_sync_context(dp))
389 403 start = gethrtime();
390 404 zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
391 405 blkid = dbuf_whichblock(dn, offset);
392 406 for (i = 0; i < nblks; i++) {
393 407 dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
394 408 if (db == NULL) {
395 409 rw_exit(&dn->dn_struct_rwlock);
396 410 dmu_buf_rele_array(dbp, nblks, tag);
397 411 zio_nowait(zio);
398 412 return (EIO);
399 413 }
400 414 /* initiate async i/o */
401 415 if (read) {
402 416 (void) dbuf_read(db, zio, dbuf_flags);
403 417 }
404 418 dbp[i] = &db->db;
405 419 }
406 420 rw_exit(&dn->dn_struct_rwlock);
407 421
408 422 /* wait for async i/o */
409 423 err = zio_wait(zio);
410 424 /* track read overhead when we are in sync context */
411 425 if (dp && dsl_pool_sync_context(dp))
412 426 dp->dp_read_overhead += gethrtime() - start;
413 427 if (err) {
414 428 dmu_buf_rele_array(dbp, nblks, tag);
415 429 return (err);
416 430 }
417 431
418 432 /* wait for other io to complete */
419 433 if (read) {
420 434 for (i = 0; i < nblks; i++) {
421 435 dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
422 436 mutex_enter(&db->db_mtx);
423 437 while (db->db_state == DB_READ ||
424 438 db->db_state == DB_FILL)
425 439 cv_wait(&db->db_changed, &db->db_mtx);
426 440 if (db->db_state == DB_UNCACHED)
427 441 err = EIO;
428 442 mutex_exit(&db->db_mtx);
429 443 if (err) {
430 444 dmu_buf_rele_array(dbp, nblks, tag);
431 445 return (err);
432 446 }
433 447 }
434 448 }
435 449
436 450 *numbufsp = nblks;
437 451 *dbpp = dbp;
438 452 return (0);
439 453 }
440 454
441 455 static int
442 456 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
443 457 uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
444 458 {
445 459 dnode_t *dn;
446 460 int err;
447 461
448 462 err = dnode_hold(os, object, FTAG, &dn);
449 463 if (err)
450 464 return (err);
451 465
452 466 err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
453 467 numbufsp, dbpp, DMU_READ_PREFETCH);
454 468
455 469 dnode_rele(dn, FTAG);
456 470
457 471 return (err);
458 472 }
459 473
460 474 int
461 475 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
462 476 uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
463 477 {
464 478 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
465 479 dnode_t *dn;
466 480 int err;
467 481
468 482 DB_DNODE_ENTER(db);
469 483 dn = DB_DNODE(db);
470 484 err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
471 485 numbufsp, dbpp, DMU_READ_PREFETCH);
472 486 DB_DNODE_EXIT(db);
473 487
474 488 return (err);
475 489 }
476 490
477 491 void
478 492 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
479 493 {
480 494 int i;
481 495 dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
482 496
483 497 if (numbufs == 0)
484 498 return;
485 499
486 500 for (i = 0; i < numbufs; i++) {
487 501 if (dbp[i])
488 502 dbuf_rele(dbp[i], tag);
489 503 }
490 504
491 505 kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
492 506 }
493 507
494 508 void
495 509 dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
496 510 {
497 511 dnode_t *dn;
498 512 uint64_t blkid;
499 513 int nblks, i, err;
500 514
501 515 if (zfs_prefetch_disable)
502 516 return;
503 517
504 518 if (len == 0) { /* they're interested in the bonus buffer */
505 519 dn = DMU_META_DNODE(os);
506 520
507 521 if (object == 0 || object >= DN_MAX_OBJECT)
508 522 return;
509 523
510 524 rw_enter(&dn->dn_struct_rwlock, RW_READER);
511 525 blkid = dbuf_whichblock(dn, object * sizeof (dnode_phys_t));
512 526 dbuf_prefetch(dn, blkid);
513 527 rw_exit(&dn->dn_struct_rwlock);
514 528 return;
515 529 }
516 530
517 531 /*
518 532 * XXX - Note, if the dnode for the requested object is not
519 533 * already cached, we will do a *synchronous* read in the
520 534 * dnode_hold() call. The same is true for any indirects.
521 535 */
522 536 err = dnode_hold(os, object, FTAG, &dn);
523 537 if (err != 0)
524 538 return;
525 539
526 540 rw_enter(&dn->dn_struct_rwlock, RW_READER);
527 541 if (dn->dn_datablkshift) {
528 542 int blkshift = dn->dn_datablkshift;
529 543 nblks = (P2ROUNDUP(offset+len, 1<<blkshift) -
530 544 P2ALIGN(offset, 1<<blkshift)) >> blkshift;
531 545 } else {
532 546 nblks = (offset < dn->dn_datablksz);
533 547 }
534 548
535 549 if (nblks != 0) {
536 550 blkid = dbuf_whichblock(dn, offset);
537 551 for (i = 0; i < nblks; i++)
538 552 dbuf_prefetch(dn, blkid+i);
539 553 }
540 554
541 555 rw_exit(&dn->dn_struct_rwlock);
542 556
543 557 dnode_rele(dn, FTAG);
544 558 }
545 559
546 560 /*
547 561 * Get the next "chunk" of file data to free. We traverse the file from
548 562 * the end so that the file gets shorter over time (if we crash in the
549 563 * middle, this will leave us in a better state). We find allocated file
550 564 * data by simply searching the allocated level 1 indirects.
551 565 */
552 566 static int
553 567 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit)
554 568 {
555 569 uint64_t len = *start - limit;
556 570 uint64_t blkcnt = 0;
557 571 uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1));
558 572 uint64_t iblkrange =
559 573 dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
560 574
561 575 ASSERT(limit <= *start);
562 576
563 577 if (len <= iblkrange * maxblks) {
564 578 *start = limit;
565 579 return (0);
566 580 }
567 581 ASSERT(ISP2(iblkrange));
568 582
569 583 while (*start > limit && blkcnt < maxblks) {
570 584 int err;
571 585
572 586 /* find next allocated L1 indirect */
573 587 err = dnode_next_offset(dn,
574 588 DNODE_FIND_BACKWARDS, start, 2, 1, 0);
575 589
576 590 /* if there are no more, then we are done */
577 591 if (err == ESRCH) {
578 592 *start = limit;
579 593 return (0);
580 594 } else if (err) {
581 595 return (err);
582 596 }
583 597 blkcnt += 1;
584 598
585 599 /* reset offset to end of "next" block back */
586 600 *start = P2ALIGN(*start, iblkrange);
587 601 if (*start <= limit)
588 602 *start = limit;
589 603 else
590 604 *start -= 1;
591 605 }
592 606 return (0);
593 607 }
594 608
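
To put numbers on the chunking above (all values here are assumptions for
illustration; none of the constants appear in this hunk): with 128K data blocks
and 16K indirect blocks (dn_indblkshift = 14, SPA_BLKPTRSHIFT = 7, and EPB(s, b)
taken to expand to 1 << (s - b)),

	EPB(14, 7)  = 1 << 7             = 128 block pointers per L1 indirect
	iblkrange   = 131072 * 128       = 16 MB of file data per L1 indirect
	maxblks     = DMU_MAX_ACCESS / (1 << 15)
	            = 10 MB / 32 KB      = 320   (taking DMU_MAX_ACCESS as 10 MB)

so each call hands back at most a few hundred L1 ranges' worth of file data,
keeping the amount of metadata dirtied in any one transaction bounded.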
595 609 static int
596 610 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
597 611 uint64_t length, boolean_t free_dnode)
598 612 {
599 613 dmu_tx_t *tx;
600 614 uint64_t object_size, start, end, len;
601 615 boolean_t trunc = (length == DMU_OBJECT_END);
602 616 int align, err;
603 617
604 618 align = 1 << dn->dn_datablkshift;
605 619 ASSERT(align > 0);
606 620 object_size = align == 1 ? dn->dn_datablksz :
607 621 (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
608 622
609 623 end = offset + length;
610 624 if (trunc || end > object_size)
611 625 end = object_size;
612 626 if (end <= offset)
613 627 return (0);
614 628 length = end - offset;
615 629
616 630 while (length) {
617 631 start = end;
618 632 /* assert(offset <= start) */
619 633 err = get_next_chunk(dn, &start, offset);
620 634 if (err)
621 635 return (err);
622 636 len = trunc ? DMU_OBJECT_END : end - start;
623 637
624 638 tx = dmu_tx_create(os);
625 639 dmu_tx_hold_free(tx, dn->dn_object, start, len);
626 640 err = dmu_tx_assign(tx, TXG_WAIT);
627 641 if (err) {
628 642 dmu_tx_abort(tx);
629 643 return (err);
630 644 }
631 645
632 646 dnode_free_range(dn, start, trunc ? -1 : len, tx);
633 647
634 648 if (start == 0 && free_dnode) {
635 649 ASSERT(trunc);
636 650 dnode_free(dn, tx);
637 651 }
638 652
639 653 length -= end - start;
640 654
641 655 dmu_tx_commit(tx);
642 656 end = start;
643 657 }
644 658 return (0);
645 659 }
646 660
647 661 int
648 662 dmu_free_long_range(objset_t *os, uint64_t object,
649 663 uint64_t offset, uint64_t length)
650 664 {
651 665 dnode_t *dn;
652 666 int err;
653 667
654 668 err = dnode_hold(os, object, FTAG, &dn);
655 669 if (err != 0)
656 670 return (err);
657 671 err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
658 672 dnode_rele(dn, FTAG);
659 673 return (err);
660 674 }
661 675
662 676 int
663 677 dmu_free_object(objset_t *os, uint64_t object)
664 678 {
665 679 dnode_t *dn;
666 680 dmu_tx_t *tx;
667 681 int err;
668 682
669 683 err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
670 684 FTAG, &dn);
671 685 if (err != 0)
672 686 return (err);
673 687 if (dn->dn_nlevels == 1) {
674 688 tx = dmu_tx_create(os);
675 689 dmu_tx_hold_bonus(tx, object);
676 690 dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
677 691 err = dmu_tx_assign(tx, TXG_WAIT);
678 692 if (err == 0) {
679 693 dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
680 694 dnode_free(dn, tx);
681 695 dmu_tx_commit(tx);
682 696 } else {
683 697 dmu_tx_abort(tx);
684 698 }
685 699 } else {
686 700 err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
687 701 }
688 702 dnode_rele(dn, FTAG);
689 703 return (err);
690 704 }
691 705
692 706 int
693 707 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
694 708 uint64_t size, dmu_tx_t *tx)
695 709 {
696 710 dnode_t *dn;
697 711 int err = dnode_hold(os, object, FTAG, &dn);
698 712 if (err)
699 713 return (err);
700 714 ASSERT(offset < UINT64_MAX);
701 715 ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
702 716 dnode_free_range(dn, offset, size, tx);
703 717 dnode_rele(dn, FTAG);
704 718 return (0);
705 719 }
706 720
707 721 int
708 722 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
709 723 void *buf, uint32_t flags)
710 724 {
711 725 dnode_t *dn;
712 726 dmu_buf_t **dbp;
713 727 int numbufs, err;
714 728
715 729 err = dnode_hold(os, object, FTAG, &dn);
716 730 if (err)
717 731 return (err);
718 732
719 733 /*
720 734 * Deal with odd block sizes, where there can't be data past the first
721 735 * block. If we ever do the tail block optimization, we will need to
722 736 * handle that here as well.
723 737 */
724 738 if (dn->dn_maxblkid == 0) {
725 739 int newsz = offset > dn->dn_datablksz ? 0 :
726 740 MIN(size, dn->dn_datablksz - offset);
727 741 bzero((char *)buf + newsz, size - newsz);
728 742 size = newsz;
729 743 }
730 744
731 745 while (size > 0) {
732 746 uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
733 747 int i;
734 748
735 749 /*
736 750 * NB: we could do this block-at-a-time, but it's nice
737 751 * to be reading in parallel.
738 752 */
739 753 err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
740 754 TRUE, FTAG, &numbufs, &dbp, flags);
741 755 if (err)
742 756 break;
743 757
744 758 for (i = 0; i < numbufs; i++) {
745 759 int tocpy;
746 760 int bufoff;
747 761 dmu_buf_t *db = dbp[i];
748 762
749 763 ASSERT(size > 0);
750 764
751 765 bufoff = offset - db->db_offset;
752 766 tocpy = (int)MIN(db->db_size - bufoff, size);
753 767
754 768 bcopy((char *)db->db_data + bufoff, buf, tocpy);
755 769
756 770 offset += tocpy;
757 771 size -= tocpy;
758 772 buf = (char *)buf + tocpy;
759 773 }
760 774 dmu_buf_rele_array(dbp, numbufs, FTAG);
761 775 }
762 776 dnode_rele(dn, FTAG);
763 777 return (err);
764 778 }
765 779
766 780 void
767 781 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
768 782 const void *buf, dmu_tx_t *tx)
769 783 {
770 784 dmu_buf_t **dbp;
771 785 int numbufs, i;
772 786
773 787 if (size == 0)
774 788 return;
775 789
776 790 VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
777 791 FALSE, FTAG, &numbufs, &dbp));
778 792
779 793 for (i = 0; i < numbufs; i++) {
780 794 int tocpy;
781 795 int bufoff;
782 796 dmu_buf_t *db = dbp[i];
783 797
784 798 ASSERT(size > 0);
785 799
786 800 bufoff = offset - db->db_offset;
787 801 tocpy = (int)MIN(db->db_size - bufoff, size);
788 802
789 803 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
790 804
791 805 if (tocpy == db->db_size)
792 806 dmu_buf_will_fill(db, tx);
793 807 else
794 808 dmu_buf_will_dirty(db, tx);
795 809
796 810 bcopy(buf, (char *)db->db_data + bufoff, tocpy);
797 811
798 812 if (tocpy == db->db_size)
799 813 dmu_buf_fill_done(db, tx);
800 814
801 815 offset += tocpy;
802 816 size -= tocpy;
803 817 buf = (char *)buf + tocpy;
804 818 }
805 819 dmu_buf_rele_array(dbp, numbufs, FTAG);
806 820 }
807 821
808 822 void
809 823 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
810 824 dmu_tx_t *tx)
811 825 {
812 826 dmu_buf_t **dbp;
813 827 int numbufs, i;
814 828
815 829 if (size == 0)
816 830 return;
817 831
818 832 VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
819 833 FALSE, FTAG, &numbufs, &dbp));
820 834
821 835 for (i = 0; i < numbufs; i++) {
822 836 dmu_buf_t *db = dbp[i];
823 837
824 838 dmu_buf_will_not_fill(db, tx);
825 839 }
826 840 dmu_buf_rele_array(dbp, numbufs, FTAG);
827 841 }
828 842
829 843 /*
830 844 * DMU support for xuio
831 845 */
832 846 kstat_t *xuio_ksp = NULL;
833 847
834 848 int
835 849 dmu_xuio_init(xuio_t *xuio, int nblk)
836 850 {
837 851 dmu_xuio_t *priv;
838 852 uio_t *uio = &xuio->xu_uio;
839 853
840 854 uio->uio_iovcnt = nblk;
841 855 uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
842 856
843 857 priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
844 858 priv->cnt = nblk;
845 859 priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
846 860 priv->iovp = uio->uio_iov;
847 861 XUIO_XUZC_PRIV(xuio) = priv;
848 862
849 863 if (XUIO_XUZC_RW(xuio) == UIO_READ)
850 864 XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
851 865 else
852 866 XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
853 867
854 868 return (0);
855 869 }
856 870
857 871 void
858 872 dmu_xuio_fini(xuio_t *xuio)
859 873 {
860 874 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
861 875 int nblk = priv->cnt;
862 876
863 877 kmem_free(priv->iovp, nblk * sizeof (iovec_t));
864 878 kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
865 879 kmem_free(priv, sizeof (dmu_xuio_t));
866 880
867 881 if (XUIO_XUZC_RW(xuio) == UIO_READ)
868 882 XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
869 883 else
870 884 XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
871 885 }
872 886
873 887 /*
874 888 * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
875 889 * and increase priv->next by 1.
876 890 */
877 891 int
878 892 dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
879 893 {
880 894 struct iovec *iov;
881 895 uio_t *uio = &xuio->xu_uio;
882 896 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
883 897 int i = priv->next++;
884 898
885 899 ASSERT(i < priv->cnt);
886 900 ASSERT(off + n <= arc_buf_size(abuf));
887 901 iov = uio->uio_iov + i;
888 902 iov->iov_base = (char *)abuf->b_data + off;
889 903 iov->iov_len = n;
890 904 priv->bufs[i] = abuf;
891 905 return (0);
892 906 }
893 907
894 908 int
895 909 dmu_xuio_cnt(xuio_t *xuio)
896 910 {
897 911 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
898 912 return (priv->cnt);
899 913 }
900 914
901 915 arc_buf_t *
902 916 dmu_xuio_arcbuf(xuio_t *xuio, int i)
903 917 {
904 918 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
905 919
906 920 ASSERT(i < priv->cnt);
907 921 return (priv->bufs[i]);
908 922 }
909 923
910 924 void
911 925 dmu_xuio_clear(xuio_t *xuio, int i)
912 926 {
913 927 dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
914 928
915 929 ASSERT(i < priv->cnt);
916 930 priv->bufs[i] = NULL;
917 931 }
918 932
919 933 static void
920 934 xuio_stat_init(void)
921 935 {
922 936 xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
923 937 KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
924 938 KSTAT_FLAG_VIRTUAL);
925 939 if (xuio_ksp != NULL) {
926 940 xuio_ksp->ks_data = &xuio_stats;
927 941 kstat_install(xuio_ksp);
928 942 }
929 943 }
930 944
931 945 static void
932 946 xuio_stat_fini(void)
933 947 {
934 948 if (xuio_ksp != NULL) {
935 949 kstat_delete(xuio_ksp);
936 950 xuio_ksp = NULL;
937 951 }
938 952 }
939 953
940 954 void
941 955 xuio_stat_wbuf_copied()
942 956 {
943 957 XUIOSTAT_BUMP(xuiostat_wbuf_copied);
944 958 }
945 959
946 960 void
947 961 xuio_stat_wbuf_nocopy()
948 962 {
949 963 XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
950 964 }
951 965
952 966 #ifdef _KERNEL
953 967 int
954 968 dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
955 969 {
956 970 dmu_buf_t **dbp;
957 971 int numbufs, i, err;
958 972 xuio_t *xuio = NULL;
959 973
960 974 /*
961 975 * NB: we could do this block-at-a-time, but it's nice
962 976 * to be reading in parallel.
963 977 */
964 978 err = dmu_buf_hold_array(os, object, uio->uio_loffset, size, TRUE, FTAG,
965 979 &numbufs, &dbp);
966 980 if (err)
967 981 return (err);
968 982
969 983 if (uio->uio_extflg == UIO_XUIO)
970 984 xuio = (xuio_t *)uio;
971 985
972 986 for (i = 0; i < numbufs; i++) {
973 987 int tocpy;
974 988 int bufoff;
975 989 dmu_buf_t *db = dbp[i];
976 990
977 991 ASSERT(size > 0);
978 992
979 993 bufoff = uio->uio_loffset - db->db_offset;
980 994 tocpy = (int)MIN(db->db_size - bufoff, size);
981 995
982 996 if (xuio) {
983 997 dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
984 998 arc_buf_t *dbuf_abuf = dbi->db_buf;
985 999 arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
986 1000 err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
987 1001 if (!err) {
988 1002 uio->uio_resid -= tocpy;
989 1003 uio->uio_loffset += tocpy;
990 1004 }
991 1005
992 1006 if (abuf == dbuf_abuf)
993 1007 XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
994 1008 else
995 1009 XUIOSTAT_BUMP(xuiostat_rbuf_copied);
996 1010 } else {
997 1011 err = uiomove((char *)db->db_data + bufoff, tocpy,
998 1012 UIO_READ, uio);
999 1013 }
1000 1014 if (err)
1001 1015 break;
1002 1016
1003 1017 size -= tocpy;
1004 1018 }
1005 1019 dmu_buf_rele_array(dbp, numbufs, FTAG);
1006 1020
1007 1021 return (err);
1008 1022 }
1009 1023
1010 1024 static int
1011 1025 dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
1012 1026 {
1013 1027 dmu_buf_t **dbp;
1014 1028 int numbufs;
1015 1029 int err = 0;
1016 1030 int i;
1017 1031
1018 1032 err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
1019 1033 FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
1020 1034 if (err)
1021 1035 return (err);
1022 1036
1023 1037 for (i = 0; i < numbufs; i++) {
1024 1038 int tocpy;
1025 1039 int bufoff;
1026 1040 dmu_buf_t *db = dbp[i];
1027 1041
1028 1042 ASSERT(size > 0);
1029 1043
1030 1044 bufoff = uio->uio_loffset - db->db_offset;
1031 1045 tocpy = (int)MIN(db->db_size - bufoff, size);
1032 1046
1033 1047 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1034 1048
1035 1049 if (tocpy == db->db_size)
1036 1050 dmu_buf_will_fill(db, tx);
1037 1051 else
1038 1052 dmu_buf_will_dirty(db, tx);
1039 1053
1040 1054 /*
1041 1055 * XXX uiomove could block forever (eg. nfs-backed
1042 1056 * pages). There needs to be a uiolockdown() function
1043 1057 * to lock the pages in memory, so that uiomove won't
1044 1058 * block.
1045 1059 */
1046 1060 err = uiomove((char *)db->db_data + bufoff, tocpy,
1047 1061 UIO_WRITE, uio);
1048 1062
1049 1063 if (tocpy == db->db_size)
1050 1064 dmu_buf_fill_done(db, tx);
1051 1065
1052 1066 if (err)
1053 1067 break;
1054 1068
1055 1069 size -= tocpy;
1056 1070 }
1057 1071
1058 1072 dmu_buf_rele_array(dbp, numbufs, FTAG);
1059 1073 return (err);
1060 1074 }
1061 1075
1062 1076 int
1063 1077 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
1064 1078 dmu_tx_t *tx)
1065 1079 {
1066 1080 dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
1067 1081 dnode_t *dn;
1068 1082 int err;
1069 1083
1070 1084 if (size == 0)
1071 1085 return (0);
1072 1086
1073 1087 DB_DNODE_ENTER(db);
1074 1088 dn = DB_DNODE(db);
1075 1089 err = dmu_write_uio_dnode(dn, uio, size, tx);
1076 1090 DB_DNODE_EXIT(db);
1077 1091
1078 1092 return (err);
1079 1093 }
1080 1094
1081 1095 int
1082 1096 dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
1083 1097 dmu_tx_t *tx)
1084 1098 {
1085 1099 dnode_t *dn;
1086 1100 int err;
1087 1101
1088 1102 if (size == 0)
1089 1103 return (0);
1090 1104
1091 1105 err = dnode_hold(os, object, FTAG, &dn);
1092 1106 if (err)
1093 1107 return (err);
1094 1108
1095 1109 err = dmu_write_uio_dnode(dn, uio, size, tx);
1096 1110
1097 1111 dnode_rele(dn, FTAG);
1098 1112
1099 1113 return (err);
1100 1114 }
1101 1115
1102 1116 int
1103 1117 dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
1104 1118 page_t *pp, dmu_tx_t *tx)
1105 1119 {
1106 1120 dmu_buf_t **dbp;
1107 1121 int numbufs, i;
1108 1122 int err;
1109 1123
1110 1124 if (size == 0)
1111 1125 return (0);
1112 1126
1113 1127 err = dmu_buf_hold_array(os, object, offset, size,
1114 1128 FALSE, FTAG, &numbufs, &dbp);
1115 1129 if (err)
1116 1130 return (err);
1117 1131
1118 1132 for (i = 0; i < numbufs; i++) {
1119 1133 int tocpy, copied, thiscpy;
1120 1134 int bufoff;
1121 1135 dmu_buf_t *db = dbp[i];
1122 1136 caddr_t va;
1123 1137
1124 1138 ASSERT(size > 0);
1125 1139 ASSERT3U(db->db_size, >=, PAGESIZE);
1126 1140
1127 1141 bufoff = offset - db->db_offset;
1128 1142 tocpy = (int)MIN(db->db_size - bufoff, size);
1129 1143
1130 1144 ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
1131 1145
1132 1146 if (tocpy == db->db_size)
1133 1147 dmu_buf_will_fill(db, tx);
1134 1148 else
1135 1149 dmu_buf_will_dirty(db, tx);
1136 1150
1137 1151 for (copied = 0; copied < tocpy; copied += PAGESIZE) {
1138 1152 ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
1139 1153 thiscpy = MIN(PAGESIZE, tocpy - copied);
1140 1154 va = zfs_map_page(pp, S_READ);
1141 1155 bcopy(va, (char *)db->db_data + bufoff, thiscpy);
1142 1156 zfs_unmap_page(pp, va);
1143 1157 pp = pp->p_next;
1144 1158 bufoff += PAGESIZE;
1145 1159 }
1146 1160
1147 1161 if (tocpy == db->db_size)
1148 1162 dmu_buf_fill_done(db, tx);
1149 1163
1150 1164 offset += tocpy;
1151 1165 size -= tocpy;
1152 1166 }
1153 1167 dmu_buf_rele_array(dbp, numbufs, FTAG);
1154 1168 return (err);
1155 1169 }
1156 1170 #endif
1157 1171
1158 1172 /*
1159 1173 * Allocate a loaned anonymous arc buffer.
1160 1174 */
1161 1175 arc_buf_t *
1162 1176 dmu_request_arcbuf(dmu_buf_t *handle, int size)
1163 1177 {
1164 1178 dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
1165 1179 spa_t *spa;
1166 1180
1167 1181 DB_GET_SPA(&spa, db);
1168 1182 return (arc_loan_buf(spa, size));
1169 1183 }
1170 1184
1171 1185 /*
1172 1186 * Free a loaned arc buffer.
1173 1187 */
1174 1188 void
1175 1189 dmu_return_arcbuf(arc_buf_t *buf)
1176 1190 {
1177 1191 arc_return_buf(buf, FTAG);
1178 1192 VERIFY(arc_buf_remove_ref(buf, FTAG) == 1);
1179 1193 }
1180 1194
1181 1195 /*
1182 1196 * When possible directly assign passed loaned arc buffer to a dbuf.
1183 1197 * If this is not possible copy the contents of passed arc buf via
1184 1198 * dmu_write().
1185 1199 */
1186 1200 void
1187 1201 dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
1188 1202 dmu_tx_t *tx)
1189 1203 {
1190 1204 dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
1191 1205 dnode_t *dn;
1192 1206 dmu_buf_impl_t *db;
1193 1207 uint32_t blksz = (uint32_t)arc_buf_size(buf);
1194 1208 uint64_t blkid;
1195 1209
1196 1210 DB_DNODE_ENTER(dbuf);
1197 1211 dn = DB_DNODE(dbuf);
1198 1212 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1199 1213 blkid = dbuf_whichblock(dn, offset);
1200 1214 VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
1201 1215 rw_exit(&dn->dn_struct_rwlock);
1202 1216 DB_DNODE_EXIT(dbuf);
1203 1217
1204 1218 if (offset == db->db.db_offset && blksz == db->db.db_size) {
1205 1219 dbuf_assign_arcbuf(db, buf, tx);
1206 1220 dbuf_rele(db, FTAG);
1207 1221 } else {
1208 1222 objset_t *os;
1209 1223 uint64_t object;
1210 1224
1211 1225 DB_DNODE_ENTER(dbuf);
1212 1226 dn = DB_DNODE(dbuf);
1213 1227 os = dn->dn_objset;
1214 1228 object = dn->dn_object;
1215 1229 DB_DNODE_EXIT(dbuf);
1216 1230
1217 1231 dbuf_rele(db, FTAG);
1218 1232 dmu_write(os, object, offset, blksz, buf->b_data, tx);
1219 1233 dmu_return_arcbuf(buf);
1220 1234 XUIOSTAT_BUMP(xuiostat_wbuf_copied);
1221 1235 }
1222 1236 }
1223 1237
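
Taken together with dmu_request_arcbuf()/dmu_return_arcbuf() above, the
intended pattern is a zero-copy write: loan a buffer sized to the target block,
fill it, and hand it off. A hedged sketch (handle, blksz, offset, src, and tx
are placeholders supplied by the caller):

	arc_buf_t *abuf = dmu_request_arcbuf(handle, blksz);

	bcopy(src, abuf->b_data, blksz);	/* or uiomove() from user data */
	dmu_assign_arcbuf(handle, offset, abuf, tx);

If offset and blksz line up with a whole block, the loaned buffer becomes the
dbuf's data with no copy; otherwise dmu_assign_arcbuf() falls back to
dmu_write() and returns the buffer itself. Either way the caller does not free
abuf afterwards; it only calls dmu_return_arcbuf() if it decides not to write
at all.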
1224 1238 typedef struct {
1225 1239 dbuf_dirty_record_t *dsa_dr;
1226 1240 dmu_sync_cb_t *dsa_done;
1227 1241 zgd_t *dsa_zgd;
1228 1242 dmu_tx_t *dsa_tx;
1229 1243 } dmu_sync_arg_t;
1230 1244
1231 1245 /* ARGSUSED */
1232 1246 static void
1233 1247 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
1234 1248 {
1235 1249 dmu_sync_arg_t *dsa = varg;
1236 1250 dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
1237 1251 blkptr_t *bp = zio->io_bp;
1238 1252
1239 1253 if (zio->io_error == 0) {
1240 1254 if (BP_IS_HOLE(bp)) {
1241 1255 /*
1242 1256 * A block of zeros may compress to a hole, but the
1243 1257 * block size still needs to be known for replay.
1244 1258 */
1245 1259 BP_SET_LSIZE(bp, db->db_size);
1246 1260 } else {
1247 1261 ASSERT(BP_GET_LEVEL(bp) == 0);
1248 1262 bp->blk_fill = 1;
1249 1263 }
1250 1264 }
1251 1265 }
1252 1266
1253 1267 static void
1254 1268 dmu_sync_late_arrival_ready(zio_t *zio)
1255 1269 {
1256 1270 dmu_sync_ready(zio, NULL, zio->io_private);
1257 1271 }
1258 1272
1259 1273 /* ARGSUSED */
1260 1274 static void
1261 1275 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
1262 1276 {
1263 1277 dmu_sync_arg_t *dsa = varg;
1264 1278 dbuf_dirty_record_t *dr = dsa->dsa_dr;
1265 1279 dmu_buf_impl_t *db = dr->dr_dbuf;
1266 1280
1267 1281 mutex_enter(&db->db_mtx);
1268 1282 ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
1269 1283 if (zio->io_error == 0) {
1270 1284 dr->dt.dl.dr_overridden_by = *zio->io_bp;
1271 1285 dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
1272 1286 dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
1273 1287 if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by))
1274 1288 BP_ZERO(&dr->dt.dl.dr_overridden_by);
1275 1289 } else {
1276 1290 dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
1277 1291 }
1278 1292 cv_broadcast(&db->db_changed);
1279 1293 mutex_exit(&db->db_mtx);
1280 1294
1281 1295 dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1282 1296
1283 1297 kmem_free(dsa, sizeof (*dsa));
1284 1298 }
1285 1299
1286 1300 static void
1287 1301 dmu_sync_late_arrival_done(zio_t *zio)
1288 1302 {
1289 1303 blkptr_t *bp = zio->io_bp;
1290 1304 dmu_sync_arg_t *dsa = zio->io_private;
1291 1305
1292 1306 if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
1293 1307 ASSERT(zio->io_bp->blk_birth == zio->io_txg);
1294 1308 ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
1295 1309 zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
1296 1310 }
1297 1311
1298 1312 dmu_tx_commit(dsa->dsa_tx);
1299 1313
1300 1314 dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
1301 1315
1302 1316 kmem_free(dsa, sizeof (*dsa));
1303 1317 }
1304 1318
1305 1319 static int
1306 1320 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
1307 1321 zio_prop_t *zp, zbookmark_t *zb)
1308 1322 {
1309 1323 dmu_sync_arg_t *dsa;
1310 1324 dmu_tx_t *tx;
1311 1325
1312 1326 tx = dmu_tx_create(os);
1313 1327 dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
1314 1328 if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
1315 1329 dmu_tx_abort(tx);
1316 1330 return (EIO); /* Make zl_get_data do txg_wait_synced() */
1317 1331 }
1318 1332
1319 1333 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1320 1334 dsa->dsa_dr = NULL;
1321 1335 dsa->dsa_done = done;
1322 1336 dsa->dsa_zgd = zgd;
1323 1337 dsa->dsa_tx = tx;
1324 1338
1325 1339 zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
1326 1340 zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp,
1327 1341 dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa,
1328 1342 ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
1329 1343
1330 1344 return (0);
1331 1345 }
1332 1346
1333 1347 /*
1334 1348 * Intent log support: sync the block associated with db to disk.
1335 1349 * N.B. and XXX: the caller is responsible for making sure that the
1336 1350 * data isn't changing while dmu_sync() is writing it.
1337 1351 *
1338 1352 * Return values:
1339 1353 *
1340 1354 * EEXIST: this txg has already been synced, so there's nothing to do.
1341 1355 * The caller should not log the write.
1342 1356 *
1343 1357 * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
1344 1358 * The caller should not log the write.
1345 1359 *
1346 1360 * EALREADY: this block is already in the process of being synced.
1347 1361 * The caller should track its progress (somehow).
1348 1362 *
1349 1363 * EIO: could not do the I/O.
1350 1364 * The caller should do a txg_wait_synced().
1351 1365 *
1352 1366 * 0: the I/O has been initiated.
1353 1367 * The caller should log this blkptr in the done callback.
1354 1368 * It is possible that the I/O will fail, in which case
1355 1369 * the error will be reported to the done callback and
1356 1370 * propagated to pio from zio_done().
1357 1371 */
1358 1372 int
1359 1373 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
1360 1374 {
1361 1375 blkptr_t *bp = zgd->zgd_bp;
1362 1376 dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
1363 1377 objset_t *os = db->db_objset;
1364 1378 dsl_dataset_t *ds = os->os_dsl_dataset;
1365 1379 dbuf_dirty_record_t *dr;
1366 1380 dmu_sync_arg_t *dsa;
1367 1381 zbookmark_t zb;
1368 1382 zio_prop_t zp;
1369 1383 dnode_t *dn;
1370 1384
1371 1385 ASSERT(pio != NULL);
1372 1386 ASSERT(BP_IS_HOLE(bp));
1373 1387 ASSERT(txg != 0);
1374 1388
1375 1389 SET_BOOKMARK(&zb, ds->ds_object,
1376 1390 db->db.db_object, db->db_level, db->db_blkid);
1377 1391
1378 1392 DB_DNODE_ENTER(db);
1379 1393 dn = DB_DNODE(db);
1380 1394 dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
1381 1395 DB_DNODE_EXIT(db);
1382 1396
1383 1397 /*
1384 1398 * If we're frozen (running ziltest), we always need to generate a bp.
1385 1399 */
1386 1400 if (txg > spa_freeze_txg(os->os_spa))
1387 1401 return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1388 1402
1389 1403 /*
1390 1404 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
1391 1405 * and us. If we determine that this txg is not yet syncing,
1392 1406 * but it begins to sync a moment later, that's OK because the
1393 1407 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
1394 1408 */
1395 1409 mutex_enter(&db->db_mtx);
1396 1410
1397 1411 if (txg <= spa_last_synced_txg(os->os_spa)) {
1398 1412 /*
1399 1413 * This txg has already synced. There's nothing to do.
1400 1414 */
1401 1415 mutex_exit(&db->db_mtx);
1402 1416 return (EEXIST);
1403 1417 }
1404 1418
1405 1419 if (txg <= spa_syncing_txg(os->os_spa)) {
1406 1420 /*
1407 1421 * This txg is currently syncing, so we can't mess with
1408 1422 * the dirty record anymore; just write a new log block.
1409 1423 */
1410 1424 mutex_exit(&db->db_mtx);
1411 1425 return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
1412 1426 }
1413 1427
1414 1428 dr = db->db_last_dirty;
1415 1429 while (dr && dr->dr_txg != txg)
1416 1430 dr = dr->dr_next;
1417 1431
1418 1432 if (dr == NULL) {
1419 1433 /*
1420 1434 * There's no dr for this dbuf, so it must have been freed.
1421 1435 * There's no need to log writes to freed blocks, so we're done.
1422 1436 */
1423 1437 mutex_exit(&db->db_mtx);
1424 1438 return (ENOENT);
1425 1439 }
1426 1440
1427 1441 ASSERT(dr->dr_txg == txg);
1428 1442 if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
1429 1443 dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
1430 1444 /*
1431 1445 * We have already issued a sync write for this buffer,
1432 1446 * or this buffer has already been synced. It could not
1433 1447 * have been dirtied since, or we would have cleared the state.
1434 1448 */
1435 1449 mutex_exit(&db->db_mtx);
1436 1450 return (EALREADY);
1437 1451 }
1438 1452
1439 1453 ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
1440 1454 dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
1441 1455 mutex_exit(&db->db_mtx);
1442 1456
1443 1457 dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
1444 1458 dsa->dsa_dr = dr;
1445 1459 dsa->dsa_done = done;
1446 1460 dsa->dsa_zgd = zgd;
1447 1461 dsa->dsa_tx = NULL;
1448 1462
1449 1463 zio_nowait(arc_write(pio, os->os_spa, txg,
1450 1464 bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp,
1451 1465 dmu_sync_ready, dmu_sync_done, dsa,
1452 1466 ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
1453 1467
1454 1468 return (0);
1455 1469 }
1456 1470
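
For the return-value contract spelled out in the block comment above, a caller
on the intent-log side would look roughly like this hedged sketch (zio, txg,
done, zgd, and os are placeholders for what the ZIL get_data callback already
has in hand):

	int error;

	error = dmu_sync(zio, txg, done, zgd);
	if (error == 0)
		return (0);		/* done callback will log the blkptr */
	if (error == EEXIST || error == ENOENT)
		return (error);		/* synced or freed: nothing to log */
	if (error == EALREADY) {
		/* an earlier dmu_sync() owns this block; track its progress */
	}
	if (error == EIO)
		txg_wait_synced(dmu_objset_pool(os), txg);	/* fall back to a full txg sync */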
1457 1471 int
1458 1472 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
1459 1473 dmu_tx_t *tx)
1460 1474 {
1461 1475 dnode_t *dn;
1462 1476 int err;
1463 1477
1464 1478 err = dnode_hold(os, object, FTAG, &dn);
1465 1479 if (err)
1466 1480 return (err);
1467 1481 err = dnode_set_blksz(dn, size, ibs, tx);
1468 1482 dnode_rele(dn, FTAG);
1469 1483 return (err);
1470 1484 }
1471 1485
1472 1486 void
1473 1487 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
1474 1488 dmu_tx_t *tx)
1475 1489 {
1476 1490 dnode_t *dn;
1477 1491
1478 1492 /* XXX assumes dnode_hold will not get an i/o error */
1479 1493 (void) dnode_hold(os, object, FTAG, &dn);
1480 1494 ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS);
1481 1495 dn->dn_checksum = checksum;
1482 1496 dnode_setdirty(dn, tx);
1483 1497 dnode_rele(dn, FTAG);
1484 1498 }
1485 1499
1486 1500 void
1487 1501 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
1488 1502 dmu_tx_t *tx)
1489 1503 {
1490 1504 dnode_t *dn;
1491 1505
1492 1506 /* XXX assumes dnode_hold will not get an i/o error */
1493 1507 (void) dnode_hold(os, object, FTAG, &dn);
1494 1508 ASSERT(compress < ZIO_COMPRESS_FUNCTIONS);
1495 1509 dn->dn_compress = compress;
1496 1510 dnode_setdirty(dn, tx);
1497 1511 dnode_rele(dn, FTAG);
1498 1512 }
1499 1513
1500 1514 int zfs_mdcomp_disable = 0;
1501 1515
1502 1516 void
1503 1517 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
1504 1518 {
1505 1519 dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
1506 - boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata ||
1520 + boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
1507 1521 (wp & WP_SPILL));
1508 1522 enum zio_checksum checksum = os->os_checksum;
1509 1523 enum zio_compress compress = os->os_compress;
1510 1524 enum zio_checksum dedup_checksum = os->os_dedup_checksum;
1511 1525 boolean_t dedup;
1512 1526 boolean_t dedup_verify = os->os_dedup_verify;
1513 1527 int copies = os->os_copies;
1514 1528
1515 1529 /*
1516 1530 * Determine checksum setting.
1517 1531 */
1518 1532 if (ismd) {
1519 1533 /*
1520 1534 * Metadata always gets checksummed. If the data
1521 1535 * checksum is multi-bit correctable, and it's not a
1522 1536 * ZBT-style checksum, then it's suitable for metadata
1523 1537 * as well. Otherwise, the metadata checksum defaults
1524 1538 * to fletcher4.
1525 1539 */
1526 1540 if (zio_checksum_table[checksum].ci_correctable < 1 ||
1527 1541 zio_checksum_table[checksum].ci_eck)
1528 1542 checksum = ZIO_CHECKSUM_FLETCHER_4;
1529 1543 } else {
1530 1544 checksum = zio_checksum_select(dn->dn_checksum, checksum);
1531 1545 }
1532 1546
1533 1547 /*
1534 1548 * Determine compression setting.
1535 1549 */
1536 1550 if (ismd) {
1537 1551 /*
1538 1552 * XXX -- we should design a compression algorithm
1539 1553 * that specializes in arrays of bps.
1540 1554 */
1541 1555 compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY :
1542 1556 ZIO_COMPRESS_LZJB;
1543 1557 } else {
1544 1558 compress = zio_compress_select(dn->dn_compress, compress);
1545 1559 }
1546 1560
1547 1561 /*
1548 1562 * Determine dedup setting. If we are in dmu_sync(), we won't
1549 1563 * actually dedup now because that's all done in syncing context;
1550 1564 * but we do want to use the dedup checkum. If the checksum is not
1551 1565 * strong enough to ensure unique signatures, force dedup_verify.
1552 1566 */
1553 1567 dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF);
1554 1568 if (dedup) {
1555 1569 checksum = dedup_checksum;
1556 1570 if (!zio_checksum_table[checksum].ci_dedup)
1557 1571 dedup_verify = 1;
1558 1572 }
1559 1573
1560 1574 if (wp & WP_DMU_SYNC)
1561 1575 dedup = 0;
1562 1576
1563 1577 if (wp & WP_NOFILL) {
1564 1578 ASSERT(!ismd && level == 0);
1565 1579 checksum = ZIO_CHECKSUM_OFF;
1566 1580 compress = ZIO_COMPRESS_OFF;
1567 1581 dedup = B_FALSE;
1568 1582 }
1569 1583
1570 1584 zp->zp_checksum = checksum;
1571 1585 zp->zp_compress = compress;
1572 1586 zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
1573 1587 zp->zp_level = level;
1574 1588 zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
1575 1589 zp->zp_dedup = dedup;
1576 1590 zp->zp_dedup_verify = dedup && dedup_verify;
1577 1591 }
1578 1592
1579 1593 int
1580 1594 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
1581 1595 {
1582 1596 dnode_t *dn;
1583 1597 int i, err;
1584 1598
1585 1599 err = dnode_hold(os, object, FTAG, &dn);
1586 1600 if (err)
1587 1601 return (err);
1588 1602 /*
1589 1603 * Sync any current changes before
1590 1604 * we go trundling through the block pointers.
1591 1605 */
1592 1606 for (i = 0; i < TXG_SIZE; i++) {
1593 1607 if (list_link_active(&dn->dn_dirty_link[i]))
1594 1608 break;
1595 1609 }
1596 1610 if (i != TXG_SIZE) {
1597 1611 dnode_rele(dn, FTAG);
1598 1612 txg_wait_synced(dmu_objset_pool(os), 0);
1599 1613 err = dnode_hold(os, object, FTAG, &dn);
1600 1614 if (err)
1601 1615 return (err);
1602 1616 }
1603 1617
1604 1618 err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
1605 1619 dnode_rele(dn, FTAG);
1606 1620
1607 1621 return (err);
1608 1622 }
1609 1623
1610 1624 void
1611 1625 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
1612 1626 {
1613 1627 dnode_phys_t *dnp;
1614 1628
1615 1629 rw_enter(&dn->dn_struct_rwlock, RW_READER);
1616 1630 mutex_enter(&dn->dn_mtx);
1617 1631
1618 1632 dnp = dn->dn_phys;
1619 1633
1620 1634 doi->doi_data_block_size = dn->dn_datablksz;
1621 1635 doi->doi_metadata_block_size = dn->dn_indblkshift ?
1622 1636 1ULL << dn->dn_indblkshift : 0;
1623 1637 doi->doi_type = dn->dn_type;
1624 1638 doi->doi_bonus_type = dn->dn_bonustype;
1625 1639 doi->doi_bonus_size = dn->dn_bonuslen;
1626 1640 doi->doi_indirection = dn->dn_nlevels;
1627 1641 doi->doi_checksum = dn->dn_checksum;
1628 1642 doi->doi_compress = dn->dn_compress;
1629 1643 doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
1630 1644 doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz;
1631 1645 doi->doi_fill_count = 0;
1632 1646 for (int i = 0; i < dnp->dn_nblkptr; i++)
1633 1647 doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill;
1634 1648
1635 1649 mutex_exit(&dn->dn_mtx);
1636 1650 rw_exit(&dn->dn_struct_rwlock);
1637 1651 }
1638 1652
1639 1653 /*
1640 1654 * Get information on a DMU object.
1641 1655 * If doi is NULL, just indicates whether the object exists.
1642 1656 */
1643 1657 int
1644 1658 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
1645 1659 {
1646 1660 dnode_t *dn;
1647 1661 int err = dnode_hold(os, object, FTAG, &dn);
1648 1662
1649 1663 if (err)
1650 1664 return (err);
1651 1665
1652 1666 if (doi != NULL)
1653 1667 dmu_object_info_from_dnode(dn, doi);
1654 1668
1655 1669 dnode_rele(dn, FTAG);
1656 1670 return (0);
1657 1671 }
1658 1672
1659 1673 /*
1660 1674 * As above, but faster; can be used when you have a held dbuf in hand.
1661 1675 */
1662 1676 void
1663 1677 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
1664 1678 {
1665 1679 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1666 1680
1667 1681 DB_DNODE_ENTER(db);
1668 1682 dmu_object_info_from_dnode(DB_DNODE(db), doi);
1669 1683 DB_DNODE_EXIT(db);
1670 1684 }
1671 1685
1672 1686 /*
1673 1687 * Faster still when you only care about the size.
1674 1688 * This is specifically optimized for zfs_getattr().
1675 1689 */
1676 1690 void
1677 1691 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
1678 1692 u_longlong_t *nblk512)
1679 1693 {
1680 1694 dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
1681 1695 dnode_t *dn;
1682 1696
1683 1697 DB_DNODE_ENTER(db);
1684 1698 dn = DB_DNODE(db);
1685 1699
1686 1700 *blksize = dn->dn_datablksz;
1687 1701 /* add 1 for dnode space */
1688 1702 *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
1689 1703 SPA_MINBLOCKSHIFT) + 1;
1690 1704 DB_DNODE_EXIT(db);
1691 1705 }
1692 1706
1693 1707 void
1694 1708 byteswap_uint64_array(void *vbuf, size_t size)
1695 1709 {
1696 1710 uint64_t *buf = vbuf;
1697 1711 size_t count = size >> 3;
1698 1712 int i;
1699 1713
1700 1714 ASSERT((size & 7) == 0);
1701 1715
1702 1716 for (i = 0; i < count; i++)
1703 1717 buf[i] = BSWAP_64(buf[i]);
1704 1718 }
1705 1719
1706 1720 void
1707 1721 byteswap_uint32_array(void *vbuf, size_t size)
1708 1722 {
1709 1723 uint32_t *buf = vbuf;
1710 1724 size_t count = size >> 2;
1711 1725 int i;
1712 1726
1713 1727 ASSERT((size & 3) == 0);
1714 1728
1715 1729 for (i = 0; i < count; i++)
1716 1730 buf[i] = BSWAP_32(buf[i]);
1717 1731 }
1718 1732
1719 1733 void
1720 1734 byteswap_uint16_array(void *vbuf, size_t size)
1721 1735 {
1722 1736 uint16_t *buf = vbuf;
1723 1737 size_t count = size >> 1;
1724 1738 int i;
1725 1739
1726 1740 ASSERT((size & 1) == 0);
1727 1741
1728 1742 for (i = 0; i < count; i++)
1729 1743 buf[i] = BSWAP_16(buf[i]);
1730 1744 }
1731 1745
1732 1746 /* ARGSUSED */
1733 1747 void
1734 1748 byteswap_uint8_array(void *vbuf, size_t size)
1735 1749 {
1736 1750 }
1737 1751
1738 1752 void
1739 1753 dmu_init(void)
1740 1754 {
1741 1755 zfs_dbgmsg_init();
1742 1756 sa_cache_init();
1743 1757 xuio_stat_init();
1744 1758 dmu_objset_init();
1745 1759 dnode_init();
1746 1760 dbuf_init();
1747 1761 zfetch_init();
1748 1762 arc_init();
1749 1763 l2arc_init();
1750 1764 }
1751 1765
1752 1766 void
1753 1767 dmu_fini(void)
1754 1768 {
1755 1769 l2arc_fini();
1756 1770 arc_fini();
1757 1771 zfetch_fini();
1758 1772 dbuf_fini();
1759 1773 dnode_fini();
1760 1774 dmu_objset_fini();
1761 1775 xuio_stat_fini();
1762 1776 sa_cache_fini();
1763 1777 zfs_dbgmsg_fini();
1764 1778 }