btrfs-transaction 0.13.0

Userspace transaction infrastructure for modifying btrfs filesystems
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
//! # In-memory filesystem state for a transaction session
//!
//! `Filesystem` is the central state object for modifying a btrfs filesystem. It
//! wraps a `BlockReader` (from `btrfs-disk`), holds the parsed superblock, all
//! tree root pointers, and tracks which blocks have been modified during the
//! current transaction.
//!
//! Open a device or image with [`Filesystem::open`], then use the read/write
//! methods to access tree blocks through `ExtentBuffer`.

use crate::buffer::ExtentBuffer;
use btrfs_disk::{
    chunk::ChunkTreeCache,
    items::DeviceItem,
    reader::{self, BlockReader, OpenFilesystem},
    superblock::{self, Superblock},
};
use std::{
    collections::{BTreeMap, BTreeSet},
    fs::File,
    io::{self, Read, Seek, Write},
};

/// In-memory filesystem state for a transaction session.
///
/// Holds everything needed to read and write tree blocks: the block reader
/// (with chunk cache for logical-to-physical resolution), the superblock,
/// all tree root pointers, the current transaction generation, and the set
/// of dirty (modified) block addresses.
///
/// `R` is the per-device I/O handle type. The struct itself places no
/// bounds on `R`; the main `impl` block requires `Read + Write + Seek`,
/// and `sync` is only provided for `R = File`.
pub struct Filesystem<R> {
    /// Block reader with fully populated chunk cache.
    reader: BlockReader<R>,
    /// Parsed superblock (updated in-memory during transactions).
    pub superblock: Superblock,
    /// Map of tree ID to root block logical address.
    ///
    /// The constructors always register tree 1 (root tree) and tree 3
    /// (chunk tree) here, taking their addresses from the superblock.
    roots: BTreeMap<u64, u64>,
    /// Snapshot of root bytenrs at transaction start. Used to detect which
    /// trees had their root block change during the transaction.
    original_roots: BTreeMap<u64, u64>,
    /// Logical addresses of blocks modified in the current transaction.
    /// `BTreeSet` gives sorted iteration in `flush_dirty` for sequential I/O.
    dirty: BTreeSet<u64>,
    /// Current transaction generation (superblock.generation + 1 during a
    /// transaction, or superblock.generation when idle).
    ///
    /// Initialised from `superblock.generation` when the filesystem is
    /// opened.
    pub generation: u64,
    /// Tree block size in bytes (copied from the superblock at open time).
    pub nodesize: u32,
    /// Minimum I/O unit in bytes (copied from the superblock at open time).
    pub sectorsize: u32,
    /// In-memory cache of extent buffers read or created during the transaction.
    /// Keyed by logical address. This avoids re-reading blocks from disk and
    /// ensures modifications are visible within the same transaction.
    block_cache: BTreeMap<u64, ExtentBuffer>,
    /// Logical addresses of blocks that have been written to stable storage
    /// (via `flush_dirty` or `write_block`). A block in this set must be
    /// COW'd before modification even if its generation matches the current
    /// transaction, because the on-disk copy is now part of the committed
    /// state and overwriting it would break crash consistency.
    ///
    /// Cleared together with `dirty` by `clear_dirty`.
    written: BTreeSet<u64>,
    /// Override for the block-group-tree id used by
    /// [`block_group_tree_id`](Self::block_group_tree_id). When `Some`,
    /// callers see this id instead of the auto-detected one. Used by
    /// the `convert-to-block-group-tree` path to pin allocator
    /// metadata to the extent tree (id 2) while the new BGT (id 11)
    /// is being built and is therefore only partially populated.
    /// Should always be cleared via [`BgTreeOverrideGuard`] (RAII)
    /// rather than written directly, so panics or early returns
    /// cannot leak the override into normal operation.
    bg_tree_override: Option<u64>,
    /// Per-device superblock `dev_item` snapshots taken at open time.
    /// The key is `devid`; the value is that device's local `dev_item`
    /// (which encodes the device's identity: `devid`, `dev_uuid`,
    /// per-device `bytes_used`, etc.). On commit, each device's
    /// superblock is rewritten with these per-device fields preserved
    /// so a multi-device filesystem doesn't get clobbered with the
    /// primary device's identity.
    per_device_dev_items: BTreeMap<u64, DeviceItem>,
}

impl<R: Read + Write + Seek> Filesystem<R> {
    /// Open a btrfs filesystem from a readable+writable+seekable handle.
    ///
    /// Performs the full bootstrap sequence (superblock, chunk cache, root
    /// tree), then wraps the result into an `Filesystem` ready for transactions.
    ///
    /// # Errors
    ///
    /// Returns an error if any I/O operation fails during bootstrap.
    pub fn open(handle: R) -> io::Result<Self> {
        let OpenFilesystem {
            reader,
            superblock,
            tree_roots,
            per_device_dev_items,
        } = reader::filesystem_open(handle)?;

        let generation = superblock.generation;
        let nodesize = superblock.nodesize;
        let sectorsize = superblock.sectorsize;

        // Convert BTreeMap<u64, (u64, u64)> to BTreeMap<u64, u64> (tree_id -> root bytenr)
        let mut roots: BTreeMap<u64, u64> = tree_roots
            .into_iter()
            .map(|(id, (bytenr, _offset))| (id, bytenr))
            .collect();

        // The root tree and chunk tree roots live in the superblock, not in
        // ROOT_ITEM entries. Add them explicitly.
        roots.insert(1, superblock.root);
        roots.insert(3, superblock.chunk_root);

        let original_roots = roots.clone();

        Ok(Self {
            reader,
            superblock,
            roots,
            original_roots,
            dirty: BTreeSet::new(),
            generation,
            nodesize,
            sectorsize,
            block_cache: BTreeMap::new(),
            written: BTreeSet::new(),
            bg_tree_override: None,
            per_device_dev_items,
        })
    }

    /// Open a btrfs filesystem using a specific superblock mirror.
    ///
    /// # Errors
    ///
    /// Returns an error if any I/O operation fails during bootstrap.
    pub fn open_mirror(handle: R, mirror: u32) -> io::Result<Self> {
        let OpenFilesystem {
            reader,
            superblock,
            tree_roots,
            per_device_dev_items,
        } = reader::filesystem_open_mirror(handle, mirror)?;

        let generation = superblock.generation;
        let nodesize = superblock.nodesize;
        let sectorsize = superblock.sectorsize;

        let mut roots: BTreeMap<u64, u64> = tree_roots
            .into_iter()
            .map(|(id, (bytenr, _offset))| (id, bytenr))
            .collect();

        roots.insert(1, superblock.root);
        roots.insert(3, superblock.chunk_root);

        let original_roots = roots.clone();

        Ok(Self {
            reader,
            superblock,
            roots,
            original_roots,
            dirty: BTreeSet::new(),
            generation,
            nodesize,
            sectorsize,
            block_cache: BTreeMap::new(),
            written: BTreeSet::new(),
            bg_tree_override: None,
            per_device_dev_items,
        })
    }

    /// Open a multi-device btrfs filesystem from a `devid -> handle` map.
    ///
    /// Every device referenced by the filesystem's chunk tree must be
    /// present in `devices`. Each device's superblock is read and
    /// validated against the map key; all devices must share the same
    /// `fsid`. The bootstrap fails with a clear error if any of these
    /// invariants is violated.
    ///
    /// # Errors
    ///
    /// Returns an error if the device map is empty, any device's
    /// superblock disagrees with its key or the primary's `fsid`, the
    /// chunk tree references a devid not in the map, or any I/O fails.
    pub fn open_multi(devices: BTreeMap<u64, R>) -> io::Result<Self> {
        let OpenFilesystem {
            reader,
            superblock,
            tree_roots,
            per_device_dev_items,
        } = reader::filesystem_open_multi(devices)?;

        let generation = superblock.generation;
        let nodesize = superblock.nodesize;
        let sectorsize = superblock.sectorsize;

        let mut roots: BTreeMap<u64, u64> = tree_roots
            .into_iter()
            .map(|(id, (bytenr, _offset))| (id, bytenr))
            .collect();

        roots.insert(1, superblock.root);
        roots.insert(3, superblock.chunk_root);

        let original_roots = roots.clone();

        Ok(Self {
            reader,
            superblock,
            roots,
            original_roots,
            dirty: BTreeSet::new(),
            generation,
            nodesize,
            sectorsize,
            block_cache: BTreeMap::new(),
            written: BTreeSet::new(),
            bg_tree_override: None,
            per_device_dev_items,
        })
    }

    /// Open a btrfs filesystem using a pre-built chunk cache.
    ///
    /// Skips the chunk tree walk entirely, using the provided cache for
    /// logical-to-physical address resolution. This is the entry point
    /// for recovery tools like `rescue chunk-recover --apply`, where the
    /// on-disk chunk tree is damaged and the cache has been reconstructed
    /// from a raw device scan.
    ///
    /// # Errors
    ///
    /// Returns an error if any I/O operation fails during bootstrap.
    pub fn open_with_chunk_cache(
        handle: R,
        mirror: u32,
        chunk_cache: ChunkTreeCache,
    ) -> io::Result<Self> {
        let OpenFilesystem {
            reader,
            superblock,
            tree_roots,
            per_device_dev_items,
        } = reader::filesystem_open_with_cache(handle, mirror, chunk_cache)?;

        let generation = superblock.generation;
        let nodesize = superblock.nodesize;
        let sectorsize = superblock.sectorsize;

        let mut roots: BTreeMap<u64, u64> = tree_roots
            .into_iter()
            .map(|(id, (bytenr, _offset))| (id, bytenr))
            .collect();

        roots.insert(1, superblock.root);
        roots.insert(3, superblock.chunk_root);

        let original_roots = roots.clone();

        Ok(Self {
            reader,
            superblock,
            roots,
            original_roots,
            dirty: BTreeSet::new(),
            generation,
            nodesize,
            sectorsize,
            block_cache: BTreeMap::new(),
            written: BTreeSet::new(),
            bg_tree_override: None,
            per_device_dev_items,
        })
    }

    /// Read a tree block at the given logical address, returning an `ExtentBuffer`.
    ///
    /// If the block is already in the in-memory cache (e.g. it was COW'd or
    /// previously read in this transaction), the cached version is returned
    /// without hitting disk.
    ///
    /// # Errors
    ///
    /// Returns an error if the block cannot be read from disk.
    pub fn read_block(&mut self, logical: u64) -> io::Result<ExtentBuffer> {
        if let Some(eb) = self.block_cache.get(&logical) {
            return Ok(eb.clone());
        }
        let data = self.reader.read_block(logical)?;
        let eb = ExtentBuffer::from_raw(data, logical);
        self.block_cache.insert(logical, eb.clone());
        Ok(eb)
    }

    /// Checksum an extent buffer, write it to disk immediately, and record
    /// it as both dirty and written.
    ///
    /// The checksum is recomputed in place on `eb` before the write. After
    /// a successful write, a clone of the buffer is stored in the in-memory
    /// cache so subsequent reads see the modification. Unlike
    /// `flush_dirty`, this method does not modify the header flags.
    ///
    /// # Errors
    ///
    /// Returns an error if the write fails. In that case the dirty set,
    /// written set and cache are left unchanged (the checksum on `eb` has
    /// already been updated).
    pub fn write_block(&mut self, eb: &mut ExtentBuffer) -> io::Result<()> {
        let csum_type = self.superblock.csum_type;
        eb.update_checksum(csum_type);

        let addr = eb.logical();
        self.reader.write_block(addr, eb.as_bytes())?;

        // Bookkeeping only happens once the bytes are on their way to disk.
        self.dirty.insert(addr);
        self.written.insert(addr);
        self.block_cache.insert(addr, eb.clone());
        Ok(())
    }

    /// Record `eb` as modified: cache a clone of it and add its address to
    /// the dirty set. Nothing is written to disk here; the actual disk
    /// write happens at commit time.
    ///
    /// # Panics
    ///
    /// In debug builds, panics if `eb.check()` fails or if the buffer's
    /// generation differs from the filesystem's current generation.
    pub fn mark_dirty(&mut self, eb: &ExtentBuffer) {
        let addr = eb.logical();

        // All modified blocks funnel through this method, so debug builds
        // validate structural invariants here to surface corruption early.
        debug_assert!(
            eb.check().is_ok(),
            "mark_dirty: block at {} failed validation: {}",
            addr,
            eb.check().unwrap_err(),
        );
        // A dirty block must carry the current transaction's generation;
        // an older generation indicates that COW was skipped.
        debug_assert_eq!(
            eb.generation(),
            self.generation,
            "mark_dirty: block at {} has generation {} but filesystem \
             generation is {} (was COW skipped?)",
            addr,
            eb.generation(),
            self.generation,
        );

        self.dirty.insert(addr);
        self.block_cache.insert(addr, eb.clone());
    }

    /// Insert a block into the in-memory cache without marking it dirty.
    ///
    /// A clone of `eb` is stored under `eb.logical()`, replacing any entry
    /// already cached for that address. The dirty and written sets are not
    /// touched.
    ///
    /// This is for test code that needs to simulate blocks already present
    /// on disk. Production code should use [`mark_dirty`](Self::mark_dirty)
    /// (for newly modified blocks) or [`read_block`](Self::read_block)
    /// (to read from disk).
    #[doc(hidden)]
    pub fn seed_cache(&mut self, eb: &ExtentBuffer) {
        self.block_cache.insert(eb.logical(), eb.clone());
    }

    /// Return the root block logical address for the given tree ID.
    ///
    /// Returns `None` when no root is currently registered for `tree_id`.
    #[must_use]
    pub fn root_bytenr(&self, tree_id: u64) -> Option<u64> {
        self.roots.get(&tree_id).copied()
    }

    /// Update the root block logical address for a tree.
    ///
    /// Inserts the entry if `tree_id` was not registered yet, otherwise
    /// overwrites it. Only the current roots map changes; the snapshot
    /// taken by `snapshot_roots` is left as it was.
    pub fn set_root_bytenr(&mut self, tree_id: u64, bytenr: u64) {
        self.roots.insert(tree_id, bytenr);
    }

    /// Load the root block of tree `tree_id` as an `ExtentBuffer`.
    ///
    /// The root address is looked up in the current roots map and the block
    /// is then fetched through [`read_block`](Self::read_block), so a
    /// cached copy is used when one exists.
    ///
    /// # Errors
    ///
    /// Returns an `io::ErrorKind::NotFound` error if the tree ID is unknown,
    /// or the underlying read error if the block cannot be read.
    pub fn root_node(&mut self, tree_id: u64) -> io::Result<ExtentBuffer> {
        match self.root_bytenr(tree_id) {
            Some(bytenr) => self.read_block(bytenr),
            None => Err(io::Error::new(
                io::ErrorKind::NotFound,
                format!("unknown tree ID {tree_id}"),
            )),
        }
    }

    /// Return an iterator over all dirty block logical addresses.
    ///
    /// Addresses are yielded in ascending order, since the dirty set is a
    /// `BTreeSet`. The iterator borrows `self` for its lifetime.
    pub fn dirty_blocks(&self) -> impl Iterator<Item = u64> + '_ {
        self.dirty.iter().copied()
    }

    /// Return the number of dirty blocks, i.e. the number of distinct
    /// logical addresses currently in the dirty set.
    #[must_use]
    pub fn dirty_count(&self) -> usize {
        self.dirty.len()
    }

    /// Check whether a block has been written to stable storage during
    /// this transaction. Such blocks must be COW'd before modification
    /// even if their generation matches the current transaction.
    ///
    /// Returns `true` when `logical` is in the written set, which is
    /// populated by `write_block` and `flush_dirty` and emptied by
    /// `clear_dirty`.
    #[must_use]
    pub fn is_written(&self, logical: u64) -> bool {
        self.written.contains(&logical)
    }

    /// Clear the dirty and written sets (used after commit or abort).
    ///
    /// The block cache is not affected; use `clear_cache` to drop cached
    /// buffers.
    pub fn clear_dirty(&mut self) {
        self.dirty.clear();
        self.written.clear();
    }

    /// Clear the block cache (used after commit or abort to free memory).
    ///
    /// Only cached buffers are dropped; the dirty and written sets keep
    /// their contents.
    pub fn clear_cache(&mut self) {
        self.block_cache.clear();
    }

    /// Return all tree root entries as `(tree_id, root_bytenr)` pairs.
    ///
    /// Entries come from the current roots map and are yielded in ascending
    /// `tree_id` order (`BTreeMap` iteration order).
    pub fn tree_roots(&self) -> impl Iterator<Item = (u64, u64)> + '_ {
        self.roots.iter().map(|(&id, &bytenr)| (id, bytenr))
    }

    /// Write every dirty block that has a cached buffer to disk.
    ///
    /// Walks the dirty set in ascending address order. For each address
    /// with a cached buffer, a clone of that buffer gets the
    /// `BTRFS_HEADER_FLAG_WRITTEN` flag set and its checksum recomputed, is
    /// written through the block reader, and its address is added to the
    /// written set. The cached buffer itself is not modified, and the dirty
    /// set is not cleared. Dirty addresses with no cached buffer are
    /// skipped.
    ///
    /// # Errors
    ///
    /// Returns an error as soon as any write fails; blocks later in the
    /// dirty set are not written in that case.
    ///
    /// # Panics
    ///
    /// In debug builds, panics if a block fails `check()` before being
    /// written.
    pub fn flush_dirty(&mut self) -> io::Result<()> {
        /// `BTRFS_HEADER_FLAG_WRITTEN` (bit 0): the kernel requires this
        /// flag on all tree blocks that have been committed to stable
        /// storage. Must be set before computing the checksum.
        const HEADER_FLAG_WRITTEN: u64 = 1 << 0;

        // The dirty set and the cache are only read here, while the reader
        // and the written set are mutated; these are separate fields, so
        // the set can be walked in place.
        let cache = &self.block_cache;
        let pending = self.dirty.iter().filter_map(|addr| cache.get(addr));

        for cached in pending {
            let mut eb = cached.clone();
            // Final validation point: after this the block is on stable
            // storage, so corruption must be caught now.
            debug_assert!(
                eb.check().is_ok(),
                "flush_dirty: block at {} failed validation before \
                     write: {}",
                eb.logical(),
                eb.check().unwrap_err(),
            );
            eb.set_flags(eb.flags() | HEADER_FLAG_WRITTEN);
            eb.update_checksum(self.superblock.csum_type);
            self.reader.write_block(eb.logical(), eb.as_bytes())?;
            self.written.insert(eb.logical());
        }
        Ok(())
    }

    /// Return a mutable reference to the underlying block reader.
    ///
    /// Access through this reference bypasses the dirty/written tracking
    /// and the block cache maintained by `Filesystem`.
    pub fn reader_mut(&mut self) -> &mut BlockReader<R> {
        &mut self.reader
    }

    /// Return a shared reference to the underlying block reader.
    #[must_use]
    pub fn reader(&self) -> &BlockReader<R> {
        &self.reader
    }

    /// Remove a tree root entry.
    ///
    /// Returns the root address that was registered for `tree_id`, or
    /// `None` if there was none. Only the current roots map is changed; the
    /// snapshot taken by `snapshot_roots` is left as it was.
    pub fn remove_root(&mut self, tree_id: u64) -> Option<u64> {
        self.roots.remove(&tree_id)
    }

    /// Evict a block from the cache (e.g. after freeing it).
    ///
    /// Also removes the address from the dirty set, so the block will not
    /// be flushed. The written set is not modified: if the address was
    /// recorded there, `is_written` keeps returning `true` for it until
    /// `clear_dirty` is called.
    pub fn evict_block(&mut self, logical: u64) {
        self.block_cache.remove(&logical);
        self.dirty.remove(&logical);
    }

    /// Record the current roots map as the baseline for change detection.
    ///
    /// Called at transaction start; `changed_roots` later compares the live
    /// roots against this copy. Any previous snapshot is replaced.
    pub fn snapshot_roots(&mut self) {
        self.original_roots.clone_from(&self.roots);
    }

    /// Reset the live roots map to the contents of the last snapshot.
    ///
    /// Used by `Transaction::abort` to roll back in-memory
    /// `set_root_bytenr` changes that pointed at COWed-but-never-written
    /// bytenrs. The snapshot itself is left unchanged.
    pub fn restore_roots_from_snapshot(&mut self) {
        self.roots.clone_from(&self.original_roots);
    }

    /// Write the in-memory superblock to the superblock mirrors of every
    /// open device, with each device's own `dev_item` spliced in.
    ///
    /// On a multi-device filesystem each device's superblock has its
    /// own `dev_item` (`devid`, `dev_uuid`, per-device `bytes_used`, etc.).
    /// Writing the primary device's superblock verbatim to a secondary
    /// would corrupt the secondary's identity. To avoid that, a copy of
    /// the in-memory superblock is made for every open device and the
    /// `dev_item` captured for that devid at open time is substituted
    /// before serializing. A device with no captured `dev_item` receives
    /// the in-memory superblock's `dev_item` unchanged.
    ///
    /// # Errors
    ///
    /// Returns an error if any underlying superblock write fails; devices
    /// later in the iteration are not written in that case.
    ///
    /// # Panics
    ///
    /// Panics if a devid obtained from the reader's device map can no
    /// longer be found when it is looked up again (an internal invariant
    /// violation).
    pub fn write_superblock_all_devices(&mut self) -> io::Result<()> {
        // Take a copy of the device ids first: the loop needs mutable
        // access to the reader's device map, which cannot overlap with a
        // live borrow of its keys.
        let open_devids: Vec<u64> =
            self.reader.devices().keys().copied().collect();

        for devid in open_devids {
            // Start from the shared superblock and substitute this
            // device's own identity when one was captured.
            let mut per_dev_sb = self.superblock.clone();
            if let Some(item) = self.per_device_dev_items.get(&devid) {
                per_dev_sb.dev_item = item.clone();
            }
            let serialized = per_dev_sb.to_bytes();

            let target = self
                .reader
                .devices_mut()
                .get_mut(&devid)
                .expect("devid present in iteration but missing from map");
            superblock::write_superblock_all_mirrors(target, &serialized)?;
        }
        Ok(())
    }

    /// Call `Write::flush()` on every open device.
    ///
    /// This only flushes userspace write buffers. For file-backed storage,
    /// use [`Filesystem<File>::sync`] instead, which also calls fsync per
    /// device.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by a device's `flush()`; devices
    /// after the failing one are not flushed.
    pub fn flush_writes(&mut self) -> io::Result<()> {
        self.reader
            .devices_mut()
            .values_mut()
            .try_for_each(|dev| dev.flush())
    }

    /// List the trees whose root block differs from the last snapshot.
    ///
    /// Each entry is `(tree_id, current_root_bytenr, level)`. A tree is
    /// reported when its current root address differs from the one recorded
    /// by `snapshot_roots`, or when it has no entry in the snapshot at all.
    /// Tree IDs 1 (root tree) and 3 (chunk tree) are never reported because
    /// their root pointers live in the superblock, not in root items.
    ///
    /// `level` is read from the cached block at the current root address;
    /// if that block is not in the cache, 0 is reported.
    #[must_use]
    pub fn changed_roots(&self) -> Vec<(u64, u64, u8)> {
        self.roots
            .iter()
            .filter_map(|(&tree_id, &current)| {
                // Superblock-rooted trees are not tracked via root items.
                if matches!(tree_id, 1 | 3) {
                    return None;
                }
                // Unchanged since the snapshot: nothing to report.
                if self.original_roots.get(&tree_id) == Some(&current) {
                    return None;
                }
                // Take the level from the cached root block when present.
                let level = self
                    .block_cache
                    .get(&current)
                    .map_or(0, ExtentBuffer::level);
                Some((tree_id, current, level))
            })
            .collect()
    }

    /// Return the id of the tree that holds `BLOCK_GROUP_ITEM` records.
    ///
    /// If an override is set (see
    /// [`pin_block_group_tree`](Self::pin_block_group_tree), typically used
    /// by the `convert-to-block-group-tree` path), that id is returned
    /// verbatim. Otherwise the id is auto-detected: 11
    /// (`BLOCK_GROUP_TREE`) when a root for tree 11 is registered, else 2
    /// (`EXTENT_TREE`).
    ///
    /// All allocator and block-group-update code paths must go through
    /// this accessor rather than re-implementing the routing, otherwise
    /// the override would not apply to them.
    #[must_use]
    pub fn block_group_tree_id(&self) -> u64 {
        match self.bg_tree_override {
            Some(pinned) => pinned,
            None if self.roots.contains_key(&11) => 11,
            None => 2,
        }
    }

    /// Set the block-group-tree id override. Prefer
    /// [`pin_block_group_tree`](Self::pin_block_group_tree) which
    /// returns an RAII guard that clears the override on drop.
    ///
    /// Passing `Some(id)` makes
    /// [`block_group_tree_id`](Self::block_group_tree_id) return `id`;
    /// passing `None` re-enables auto-detection. The value stays in effect
    /// until it is overwritten again.
    ///
    /// Exposed primarily for unit tests of the routing primitive.
    #[doc(hidden)]
    pub fn bg_tree_override_for_test(&mut self, id: Option<u64>) {
        self.bg_tree_override = id;
    }

    /// Force [`block_group_tree_id`](Self::block_group_tree_id) to return
    /// `id` for as long as the returned guard is alive.
    ///
    /// The guard remembers the override that was in effect before the call
    /// (typically `None`) and puts it back when dropped. It holds the
    /// mutable borrow of the filesystem, so the filesystem is reached
    /// through the guard while pinned.
    ///
    /// Use this in conversion paths so that panics or `?` early-returns
    /// cannot leave the override stuck on the wrong value.
    pub fn pin_block_group_tree(
        &mut self,
        id: u64,
    ) -> BgTreeOverrideGuard<'_, R> {
        // `replace` installs the new override and returns the old one.
        let prev = self.bg_tree_override.replace(id);
        BgTreeOverrideGuard { fs: self, prev }
    }
}

/// RAII guard that restores the previous block-group-tree
/// override on drop. Created by
/// [`Filesystem::pin_block_group_tree`].
///
/// The guard must be kept alive for as long as the pin should stay in
/// effect: dropping it (including by discarding the return value of
/// `pin_block_group_tree`) restores the previous override immediately.
#[must_use = "dropping the guard immediately restores the previous override"]
pub struct BgTreeOverrideGuard<'a, R> {
    /// Filesystem whose override is pinned; borrowed mutably for the
    /// guard's whole lifetime.
    fs: &'a mut Filesystem<R>,
    /// Override value that was in effect before the pin, restored on drop.
    prev: Option<u64>,
}

impl<R> BgTreeOverrideGuard<'_, R> {
    /// Reborrow the pinned filesystem mutably.
    ///
    /// The returned reference is tied to the borrow of the guard, so it
    /// cannot outlive the guard.
    pub fn fs_mut(&mut self) -> &mut Filesystem<R> {
        &mut *self.fs
    }
}

impl<R> Drop for BgTreeOverrideGuard<'_, R> {
    /// Put back the override value that was in effect when the guard was
    /// created.
    fn drop(&mut self) {
        self.fs.bg_tree_override = self.prev;
    }
}

impl Filesystem<File> {
    /// Sync all data to stable storage on every device (fsync).
    ///
    /// Calls `File::sync_all()` on each open device handle in turn. This
    /// should be called after commit to guarantee durability.
    ///
    /// # Errors
    ///
    /// Returns the first error reported by `sync_all()`; devices after the
    /// failing one are not synced.
    pub fn sync(&mut self) -> io::Result<()> {
        self.reader
            .devices_mut()
            .values_mut()
            .try_for_each(|dev| dev.sync_all())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // A meaningful `Filesystem` test needs a real filesystem image. The
    // tests below only exercise the container types used for bookkeeping;
    // full integration tests will use temporary images created by
    // btrfs-mkfs.

    /// The dirty set stores distinct addresses and can be emptied.
    #[test]
    fn dirty_tracking() {
        let mut dirty: BTreeSet<u64> =
            [65536u64, 131072].iter().copied().collect();
        assert_eq!(dirty.len(), 2);
        assert!(dirty.contains(&65536));

        dirty.clear();
        assert!(dirty.is_empty());
    }

    /// The roots map answers lookups for known and unknown tree ids.
    #[test]
    fn roots_map() {
        let roots: BTreeMap<u64, u64> =
            [(1u64, 65536u64), (5, 131072)].iter().copied().collect();
        assert_eq!(roots.get(&1), Some(&65536));
        assert_eq!(roots.get(&5), Some(&131072));
        assert_eq!(roots.get(&99), None);
    }
}