// btrfs_disk/reader.rs
1//! # Block device reader with logical-to-physical address resolution
2//!
3//! Provides `BlockReader` which reads btrfs tree blocks by logical address,
4//! resolving them through the chunk tree cache. Also provides `filesystem_open`
5//! which bootstraps a complete `BlockReader` from a raw block device or image.
6
7use crate::{
8    chunk::{
9        self, ChunkTreeCache, ParityKind, ParityPlan, ParityRow,
10        StripePlacement, WritePlan,
11    },
12    raid56, raw,
13    superblock::{self, Superblock},
14    tree::{KeyType, TreeBlock},
15};
16use bytes::Buf;
17use std::{
18    collections::BTreeMap,
19    io::{self, Read, Seek, SeekFrom, Write},
20    mem,
21};
22
/// A block reader that resolves logical addresses through a chunk cache.
///
/// Holds one I/O handle per device, keyed by `devid`. For single-device
/// filesystems the map has a single entry. For RAID1 / RAID1C3 / RAID1C4
/// / RAID10 / DUP, each stripe's `devid` (from the chunk cache) is used
/// to look up the handle. SINGLE and DUP work with a one-entry map.
pub struct BlockReader<R> {
    // Per-device I/O handles, keyed by device id (`dev_item.devid`).
    devices: BTreeMap<u64, R>,
    // Tree block size in bytes; `read_block` reads exactly this much.
    nodesize: u32,
    // Logical-to-physical mapping used for all address resolution.
    chunk_cache: ChunkTreeCache,
}
34
35impl<R> BlockReader<R> {
36    /// Create a single-device block reader.
37    ///
38    /// `devid` is the device id (`superblock.dev_item.devid`) under which
39    /// this handle is registered. Stripe lookups for this device must
40    /// resolve to this devid.
41    pub fn new(
42        handle: R,
43        devid: u64,
44        nodesize: u32,
45        chunk_cache: ChunkTreeCache,
46    ) -> Self {
47        let mut devices = BTreeMap::new();
48        devices.insert(devid, handle);
49        Self {
50            devices,
51            nodesize,
52            chunk_cache,
53        }
54    }
55
56    /// Create a multi-device block reader.
57    ///
58    /// `devices` maps each device id to its I/O handle. Every devid
59    /// referenced by the chunk cache must be present.
60    #[must_use]
61    pub fn new_multi(
62        devices: BTreeMap<u64, R>,
63        nodesize: u32,
64        chunk_cache: ChunkTreeCache,
65    ) -> Self {
66        Self {
67            devices,
68            nodesize,
69            chunk_cache,
70        }
71    }
72
73    /// Return the per-devid handle map.
74    #[must_use]
75    pub fn devices(&self) -> &BTreeMap<u64, R> {
76        &self.devices
77    }
78
79    /// Return the per-devid handle map mutably.
80    ///
81    /// Used by transaction commit / sync / flush paths that need to
82    /// flush every device. For ordinary reads/writes prefer
83    /// [`read_block`](Self::read_block), [`read_data`](Self::read_data),
84    /// or [`write_block`](Self::write_block) which route by devid via
85    /// the chunk cache.
86    pub fn devices_mut(&mut self) -> &mut BTreeMap<u64, R> {
87        &mut self.devices
88    }
89
90    /// Return the underlying handle for a single-device filesystem.
91    ///
92    /// Convenience for offline tools (`btrfs-tune`,
93    /// `btrfs filesystem resize` on a regular file) that operate on
94    /// one device at a time and need raw file access (e.g. `set_len`
95    /// or full-file scans).
96    ///
97    /// # Panics
98    ///
99    /// Panics if more than one device is open. Multi-device callers
100    /// must use [`devices_mut`](Self::devices_mut) and route by
101    /// devid explicitly.
102    pub fn single_device_mut(&mut self) -> &mut R {
103        assert_eq!(
104            self.devices.len(),
105            1,
106            "single_device_mut: filesystem has {} devices, not 1",
107            self.devices.len(),
108        );
109        self.devices.values_mut().next().unwrap()
110    }
111}
112
113impl<R: Read + Seek> BlockReader<R> {
114    /// Read raw bytes at a logical address, resolving to physical via the chunk cache.
115    ///
116    /// # Errors
117    ///
118    /// Returns an error if the logical address is unmapped, the resolved
119    /// device id is not in the handle map, or the underlying read fails.
120    pub fn read_block(&mut self, logical: u64) -> io::Result<Vec<u8>> {
121        // Tree blocks are always nodesize <= stripe_len, so a single
122        // block lives entirely in one row. Use plan_read so striped
123        // profiles (RAID0 / RAID10 / RAID5 / RAID6) route to the
124        // correct device column; mirrored/SINGLE chunks return one
125        // placement on stripes[0] either way.
126        self.read_data(logical, self.nodesize as usize)
127    }
128
129    /// Read and parse a tree block at a logical address.
130    ///
131    /// # Errors
132    ///
133    /// Returns an error if the logical address is unmapped or the underlying read fails.
134    pub fn read_tree_block(&mut self, logical: u64) -> io::Result<TreeBlock> {
135        let buf = self.read_block(logical)?;
136        Ok(TreeBlock::parse(&buf))
137    }
138
139    /// Return a reference to the chunk cache.
140    #[must_use]
141    pub fn chunk_cache(&self) -> &ChunkTreeCache {
142        &self.chunk_cache
143    }
144
145    /// Return a mutable reference to the chunk cache.
146    pub fn chunk_cache_mut(&mut self) -> &mut ChunkTreeCache {
147        &mut self.chunk_cache
148    }
149
150    /// Return the nodesize.
151    #[must_use]
152    pub fn nodesize(&self) -> u32 {
153        self.nodesize
154    }
155
156    /// Read arbitrary data at a logical address (not limited to nodesize).
157    ///
158    /// Unlike `read_block` which always reads `nodesize` bytes, this reads
159    /// exactly `len` bytes. Used for reading file data extents.
160    ///
161    /// Uses [`ChunkTreeCache::plan_read`](crate::chunk::ChunkTreeCache::plan_read)
162    /// internally so reads on striped profiles (RAID0 / RAID10 /
163    /// RAID5 / RAID6) that span multiple rows assemble the bytes from
164    /// the correct devices in order. RAID5/RAID6 reads route to the
165    /// data column owning each row's bytes, ignoring parity (degraded
166    /// reads are out of scope).
167    ///
168    /// # Errors
169    ///
170    /// Returns an error if the logical address is unmapped, the request
171    /// extends past the chunk, or the underlying read fails.
172    pub fn read_data(
173        &mut self,
174        logical: u64,
175        len: usize,
176    ) -> io::Result<Vec<u8>> {
177        let placements =
178            self.chunk_cache.plan_read(logical, len).ok_or_else(|| {
179                io::Error::new(
180                    io::ErrorKind::NotFound,
181                    format!(
182                        "logical address {logical} not mapped or unsupported profile"
183                    ),
184                )
185            })?;
186        let mut buf = vec![0u8; len];
187        for p in placements {
188            let dev = self.device_handle_mut(p.devid)?;
189            dev.seek(SeekFrom::Start(p.physical))?;
190            dev.read_exact(&mut buf[p.buf_offset..p.buf_offset + p.len])?;
191        }
192        Ok(buf)
193    }
194
195    /// Look up a device handle by `devid`. Returns a clear error if the
196    /// chunk cache references a device that was not opened.
197    fn device_handle_mut(&mut self, devid: u64) -> io::Result<&mut R> {
198        self.devices.get_mut(&devid).ok_or_else(|| {
199            io::Error::new(
200                io::ErrorKind::NotFound,
201                format!(
202                    "device {devid} not open (referenced by the chunk cache)"
203                ),
204            )
205        })
206    }
207}
208
impl<R: Read + Write + Seek> BlockReader<R> {
    /// Write raw bytes to a logical address, routing to the correct
    /// per-device locations based on the chunk's RAID profile.
    ///
    /// Uses [`ChunkTreeCache::plan_write`](crate::chunk::ChunkTreeCache::plan_write)
    /// internally. For DUP / RAID1 / RAID1C3 / RAID1C4 every mirror
    /// receives the same bytes; for RAID0 each row goes to exactly one
    /// device; for RAID10 each row goes to one mirror pair; for
    /// RAID5/RAID6 the executor prereads every data column slot of
    /// every touched row, mixes in caller bytes, computes parity, then
    /// writes data + parity. Writes larger than `stripe_len` on a
    /// striped profile are split into per-row segments automatically.
    ///
    /// # Errors
    ///
    /// Returns an error if the logical address is unmapped, the
    /// request extends past the chunk, any referenced device is not
    /// open, or any underlying read/write fails.
    pub fn write_block(&mut self, logical: u64, buf: &[u8]) -> io::Result<()> {
        let plan = self.chunk_cache.plan_write(logical, buf.len()).ok_or_else(
            || {
                io::Error::new(
                    io::ErrorKind::NotFound,
                    format!(
                        "logical address {logical} not mapped or unsupported profile"
                    ),
                )
            },
        )?;
        // Plain plans write caller bytes directly; parity plans require
        // the read-modify-write executor below.
        match plan {
            WritePlan::Plain(placements) => {
                self.execute_plain_plan(buf, &placements)
            }
            WritePlan::Parity(plan) => self.execute_parity_plan(buf, &plan),
        }
    }

    /// Write each placement's slice of `buf` (`buf_offset..buf_offset+len`)
    /// at its physical location on the placement's device.
    fn execute_plain_plan(
        &mut self,
        buf: &[u8],
        placements: &[StripePlacement],
    ) -> io::Result<()> {
        for p in placements {
            let dev = self.device_handle_mut(p.devid)?;
            dev.seek(SeekFrom::Start(p.physical))?;
            dev.write_all(&buf[p.buf_offset..p.buf_offset + p.len])?;
        }
        Ok(())
    }

    /// Run the read-modify-write cycle for a RAID5/RAID6 write.
    ///
    /// For every row in the plan: preread each data column slot into a
    /// scratch buffer, overlay any caller bytes, compute P (and Q for
    /// RAID6) over the assembled scratches, then issue the data writes
    /// (the overlaid byte ranges only) and the parity writes (full
    /// `stripe_len` slots).
    fn execute_parity_plan(
        &mut self,
        buf: &[u8],
        plan: &ParityPlan,
    ) -> io::Result<()> {
        let stripe_len = plan.stripe_len as usize;
        // Rows are independent; process them in plan order.
        for row in &plan.rows {
            self.execute_parity_row(buf, stripe_len, row)?;
        }
        Ok(())
    }

    /// Read-modify-write a single parity row: preread, overlay, compute
    /// parity, write data, write parity — in that order.
    fn execute_parity_row(
        &mut self,
        buf: &[u8],
        stripe_len: usize,
        row: &ParityRow,
    ) -> io::Result<()> {
        // Preread every data column slot of the row into its own scratch.
        // Columns without an overlay still contribute their on-disk
        // bytes to the parity computation below.
        let mut scratch: Vec<Vec<u8>> =
            Vec::with_capacity(row.data_columns.len());
        for col in &row.data_columns {
            let mut slot = vec![0u8; stripe_len];
            let dev = self.device_handle_mut(col.devid)?;
            dev.seek(SeekFrom::Start(col.physical))?;
            dev.read_exact(&mut slot)?;
            scratch.push(slot);
        }
        // Overlay caller bytes onto each touched slot.
        for (col, slot) in row.data_columns.iter().zip(scratch.iter_mut()) {
            if let Some(ov) = &col.overlay {
                let dst_start = ov.slot_offset as usize;
                let dst_end = dst_start + ov.len as usize;
                let src_start = ov.buf_offset;
                let src_end = src_start + ov.len as usize;
                slot[dst_start..dst_end]
                    .copy_from_slice(&buf[src_start..src_end]);
            }
        }
        // Compute parity from the assembled (post-overlay) data slots.
        let stripe_refs: Vec<&[u8]> =
            scratch.iter().map(Vec::as_slice).collect();
        // Q parity is computed only when some target in this row asks
        // for it (i.e. the chunk is RAID6 per the plan).
        let want_q = row.parity_targets.iter().any(|t| t.kind == ParityKind::Q);
        let (p_buf, q_buf) = if want_q {
            let (p, q) = raid56::compute_p_q(&stripe_refs);
            (p, Some(q))
        } else {
            (raid56::compute_p(&stripe_refs), None)
        };
        // Issue the data writes (overlaid byte ranges only).
        for col in &row.data_columns {
            let Some(ov) = &col.overlay else { continue };
            let dev = self.device_handle_mut(col.devid)?;
            dev.seek(SeekFrom::Start(
                col.physical + u64::from(ov.slot_offset),
            ))?;
            let src_start = ov.buf_offset;
            let src_end = src_start + ov.len as usize;
            dev.write_all(&buf[src_start..src_end])?;
        }
        // Issue the parity writes (full stripe_len slots).
        for target in &row.parity_targets {
            let bytes = match target.kind {
                ParityKind::P => &p_buf,
                ParityKind::Q => {
                    q_buf.as_ref().expect("Q target without Q computation")
                }
            };
            let dev = self.device_handle_mut(target.devid)?;
            dev.seek(SeekFrom::Start(target.physical))?;
            dev.write_all(bytes)?;
        }
        Ok(())
    }
}
341
/// Result of opening a btrfs filesystem from a block device or image.
///
/// Returned by [`filesystem_open`], [`filesystem_open_mirror`],
/// [`filesystem_open_multi`], and [`filesystem_open_with_cache`].
pub struct OpenFilesystem<R> {
    /// Block reader with fully populated chunk cache.
    pub reader: BlockReader<R>,
    /// Parsed primary-device superblock.
    pub superblock: Superblock,
    /// Map of tree ID -> (root block logical address, key offset), from the root tree.
    pub tree_roots: BTreeMap<u64, (u64, u64)>,
    /// Per-device `dev_item` snapshots taken at open time. One entry
    /// per opened device (always at least the primary). The transaction
    /// crate uses these to splice the correct per-device identity into
    /// the superblock when writing it back during commit, so a
    /// multi-device filesystem doesn't get clobbered with the primary
    /// device's `dev_item`.
    pub per_device_dev_items: BTreeMap<u64, crate::items::DeviceItem>,
}
358
/// Open a btrfs filesystem by bootstrapping from the superblock.
///
/// This performs the full bootstrap sequence:
/// 1. Read the superblock (mirror 0)
/// 2. Seed the chunk cache from the `sys_chunk_array`
/// 3. Read the full chunk tree to complete the cache
/// 4. Read the root tree to collect all tree root pointers
///
/// Thin wrapper over [`filesystem_open_mirror`] selecting mirror 0.
///
/// # Errors
///
/// Returns an error if any I/O operation fails during bootstrap.
pub fn filesystem_open<R: Read + Seek>(
    reader: R,
) -> io::Result<OpenFilesystem<R>> {
    filesystem_open_mirror(reader, 0)
}
375
376/// Open a btrfs filesystem using a specific superblock mirror (0, 1, or 2).
377///
378/// # Errors
379///
380/// Returns an error if any I/O operation fails during bootstrap.
381pub fn filesystem_open_mirror<R: Read + Seek>(
382    reader: R,
383    mirror: u32,
384) -> io::Result<OpenFilesystem<R>> {
385    let mut reader = reader;
386
387    // Step 1: read the superblock
388    let sb = superblock::read_superblock(&mut reader, mirror)?;
389    let primary_devid = sb.dev_item.devid;
390    let mut per_device_dev_items = BTreeMap::new();
391    per_device_dev_items.insert(primary_devid, sb.dev_item.clone());
392
393    // Step 2: seed chunk cache from sys_chunk_array
394    let chunk_cache = chunk::seed_from_sys_chunk_array(
395        &sb.sys_chunk_array,
396        sb.sys_chunk_array_size,
397    );
398
399    let mut block_reader =
400        BlockReader::new(reader, primary_devid, sb.nodesize, chunk_cache);
401
402    // Step 3: read the full chunk tree to complete the cache
403    read_chunk_tree(&mut block_reader, sb.chunk_root)?;
404
405    // Step 4: read the root tree to collect tree roots
406    let tree_roots = read_root_tree(&mut block_reader, sb.root)?;
407
408    Ok(OpenFilesystem {
409        reader: block_reader,
410        superblock: sb,
411        tree_roots,
412        per_device_dev_items,
413    })
414}
415
/// Open a multi-device btrfs filesystem from a map of `devid -> handle`.
///
/// All devices in the filesystem must be present in the map; the
/// bootstrap fails with a clear error if the chunk tree references a
/// devid that is not in the map. Each device's superblock is read and
/// its `dev_item.devid` is verified against the map key, and all
/// devices' `fsid` must match.
///
/// The "primary" superblock used for filesystem-wide fields (root,
/// `chunk_root`, generation, etc.) is the one with the lowest devid.
///
/// # Errors
///
/// Returns an error if any superblock cannot be read, any device's
/// superblock disagrees with its map key or with the primary's fsid,
/// or the chunk tree references a devid not in the map.
///
/// # Panics
///
/// Panics if the in-memory device map is empty after the initial
/// non-empty check (an internal invariant violation, not a runtime
/// possibility).
pub fn filesystem_open_multi<R: Read + Seek>(
    devices: BTreeMap<u64, R>,
) -> io::Result<OpenFilesystem<R>> {
    if devices.is_empty() {
        return Err(io::Error::other(
            "filesystem_open_multi: device map is empty",
        ));
    }
    let mut devices = devices;

    // Step 1: read each device's superblock and validate identity.
    let mut per_device_dev_items: BTreeMap<u64, crate::items::DeviceItem> =
        BTreeMap::new();
    let mut superblocks: BTreeMap<u64, Superblock> = BTreeMap::new();
    for (&devid, dev) in &mut devices {
        // Mirror 0 of each device; the map key must agree with the
        // on-disk identity so stripe routing is trustworthy.
        let sb = superblock::read_superblock(dev, 0)?;
        if sb.dev_item.devid != devid {
            return Err(io::Error::other(format!(
                "device map key {devid} doesn't match superblock dev_item.devid {}",
                sb.dev_item.devid,
            )));
        }
        per_device_dev_items.insert(devid, sb.dev_item.clone());
        superblocks.insert(devid, sb);
    }

    // Step 2: pick the lowest-devid superblock as authoritative for
    // filesystem-wide fields, and validate fsid consistency.
    // BTreeMap iterates in ascending key order, so `values().next()`
    // is the lowest devid; non-emptiness was checked above.
    let primary_sb = superblocks.values().next().unwrap().clone();
    for (devid, sb) in &superblocks {
        if sb.fsid != primary_sb.fsid {
            return Err(io::Error::other(format!(
                "device {devid} fsid {} differs from primary fsid {}",
                sb.fsid, primary_sb.fsid,
            )));
        }
    }

    // Step 3: seed chunk cache from primary's sys_chunk_array.
    let chunk_cache = chunk::seed_from_sys_chunk_array(
        &primary_sb.sys_chunk_array,
        primary_sb.sys_chunk_array_size,
    );

    let mut block_reader =
        BlockReader::new_multi(devices, primary_sb.nodesize, chunk_cache);

    // Step 4: read the full chunk tree to complete the cache.
    read_chunk_tree(&mut block_reader, primary_sb.chunk_root)?;

    // Step 5: validate every devid the chunk cache references is open.
    let mut referenced: std::collections::BTreeSet<u64> =
        std::collections::BTreeSet::new();
    for mapping in block_reader.chunk_cache().iter() {
        for stripe in &mapping.stripes {
            referenced.insert(stripe.devid);
        }
    }
    for devid in &referenced {
        if !block_reader.devices().contains_key(devid) {
            return Err(io::Error::other(format!(
                "chunk tree references devid {devid} but no handle was provided"
            )));
        }
    }

    // Step 6: read the root tree.
    let tree_roots = read_root_tree(&mut block_reader, primary_sb.root)?;

    Ok(OpenFilesystem {
        reader: block_reader,
        superblock: primary_sb,
        tree_roots,
        per_device_dev_items,
    })
}
514
515/// Open a btrfs filesystem using a pre-built chunk cache.
516///
517/// Skips the chunk tree walk entirely, using the provided cache for
518/// all logical-to-physical address resolution. This is the entry point
519/// for `rescue chunk-recover --apply`, where the on-disk chunk tree is
520/// damaged and the cache has been reconstructed from a raw device scan.
521///
522/// The root tree is still read normally (it becomes accessible once the
523/// correct chunk mappings are in place).
524///
525/// # Errors
526///
527/// Returns an error if the superblock read or root tree walk fails.
528pub fn filesystem_open_with_cache<R: Read + Seek>(
529    reader: R,
530    mirror: u32,
531    chunk_cache: ChunkTreeCache,
532) -> io::Result<OpenFilesystem<R>> {
533    let mut reader = reader;
534    let sb = superblock::read_superblock(&mut reader, mirror)?;
535    let primary_devid = sb.dev_item.devid;
536    let mut per_device_dev_items = BTreeMap::new();
537    per_device_dev_items.insert(primary_devid, sb.dev_item.clone());
538    let mut block_reader =
539        BlockReader::new(reader, primary_devid, sb.nodesize, chunk_cache);
540    let tree_roots = read_root_tree(&mut block_reader, sb.root)?;
541
542    Ok(OpenFilesystem {
543        reader: block_reader,
544        superblock: sb,
545        tree_roots,
546        per_device_dev_items,
547    })
548}
549
550/// Recursively read the chunk tree to populate the chunk cache.
551///
552/// Starting from the given root block, walks all leaves and inserts any
553/// `CHUNK_ITEM` entries that are not already present in the cache.
554///
555/// # Errors
556///
557/// Returns an error if any tree block read fails.
558pub fn read_chunk_tree<R: Read + Seek>(
559    reader: &mut BlockReader<R>,
560    root_logical: u64,
561) -> io::Result<()> {
562    let block = reader.read_tree_block(root_logical)?;
563
564    match &block {
565        TreeBlock::Leaf { items, data, .. } => {
566            for item in items {
567                if item.key.key_type != KeyType::ChunkItem {
568                    continue;
569                }
570                let item_data = &data[mem::size_of::<raw::btrfs_header>()
571                    + item.offset as usize..];
572                if let Some((mapping, _)) =
573                    chunk::parse_chunk_item(item_data, item.key.offset)
574                {
575                    // Only insert if not already in cache (sys_chunk_array may
576                    // have already seeded some entries)
577                    if reader.chunk_cache.lookup(mapping.logical).is_none() {
578                        reader.chunk_cache.insert(mapping);
579                    }
580                }
581            }
582        }
583        TreeBlock::Node { ptrs, .. } => {
584            for ptr in ptrs {
585                read_chunk_tree(reader, ptr.blockptr)?;
586            }
587        }
588    }
589
590    Ok(())
591}
592
593/// Read the root tree to collect all tree root pointers.
594///
595/// Returns a map of `tree_id` (objectid) -> `(root_bytenr, key_offset)`.
596///
597/// # Errors
598///
599/// Returns an error if any tree block read fails.
600pub fn read_root_tree<R: Read + Seek>(
601    reader: &mut BlockReader<R>,
602    root_logical: u64,
603) -> io::Result<BTreeMap<u64, (u64, u64)>> {
604    let mut tree_roots = BTreeMap::new();
605    collect_root_items(reader, root_logical, &mut tree_roots)?;
606    Ok(tree_roots)
607}
608
/// Tree traversal order.
///
/// Selects how [`tree_walk`] visits the blocks under a root.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Traversal {
    /// Breadth-first: print all nodes at level N, then N-1, down to leaves.
    Bfs,
    /// Depth-first: print a node, then recursively its children.
    Dfs,
}
617
618/// Walk a tree starting at `root_logical`, calling `visitor` for each block.
619///
620/// # Errors
621///
622/// Returns an error if any tree block cannot be read.
623pub fn tree_walk<R: Read + Seek>(
624    reader: &mut BlockReader<R>,
625    root_logical: u64,
626    traversal: Traversal,
627    visitor: &mut dyn FnMut(&TreeBlock),
628) -> io::Result<()> {
629    match traversal {
630        Traversal::Bfs => tree_walk_bfs(reader, root_logical, visitor),
631        Traversal::Dfs => tree_walk_dfs(reader, root_logical, visitor),
632    }
633}
634
635fn tree_walk_dfs<R: Read + Seek>(
636    reader: &mut BlockReader<R>,
637    logical: u64,
638    visitor: &mut dyn FnMut(&TreeBlock),
639) -> io::Result<()> {
640    let block = reader.read_tree_block(logical)?;
641    visitor(&block);
642
643    if let TreeBlock::Node { ptrs, .. } = &block {
644        for ptr in ptrs {
645            tree_walk_dfs(reader, ptr.blockptr, visitor)?;
646        }
647    }
648
649    Ok(())
650}
651
652fn tree_walk_bfs<R: Read + Seek>(
653    reader: &mut BlockReader<R>,
654    root_logical: u64,
655    visitor: &mut dyn FnMut(&TreeBlock),
656) -> io::Result<()> {
657    let root_block = reader.read_tree_block(root_logical)?;
658    let root_level = root_block.header().level;
659    visitor(&root_block);
660
661    let mut current_level_ptrs: Vec<u64> = match &root_block {
662        TreeBlock::Node { ptrs, .. } => {
663            ptrs.iter().map(|p| p.blockptr).collect()
664        }
665        TreeBlock::Leaf { .. } => return Ok(()),
666    };
667
668    for _level in (0..root_level).rev() {
669        let mut next_level_ptrs = Vec::new();
670
671        for logical in &current_level_ptrs {
672            let block = reader.read_tree_block(*logical)?;
673            visitor(&block);
674
675            if let TreeBlock::Node { ptrs, .. } = &block {
676                next_level_ptrs.extend(ptrs.iter().map(|p| p.blockptr));
677            }
678        }
679
680        current_level_ptrs = next_level_ptrs;
681    }
682
683    Ok(())
684}
685
686/// Walk a tree, continuing past individual block read errors.
687///
688/// Unlike [`tree_walk`], this does not stop when a child block cannot be read.
689/// Instead, it calls `on_error` with the logical address and the I/O error,
690/// then continues with remaining siblings. The root block failure still
691/// propagates since there is nothing to walk.
692///
693/// # Errors
694///
695/// Returns an error only if the root block itself cannot be read.
696pub fn tree_walk_tolerant<R: Read + Seek>(
697    reader: &mut BlockReader<R>,
698    root_logical: u64,
699    visitor: &mut dyn FnMut(&[u8], &TreeBlock),
700    on_error: &mut dyn FnMut(u64, &io::Error),
701) -> io::Result<()> {
702    let buf = reader.read_block(root_logical)?;
703    let block = TreeBlock::parse(&buf);
704    visitor(&buf, &block);
705
706    if let TreeBlock::Node { ptrs, .. } = &block {
707        for ptr in ptrs {
708            tree_walk_tolerant_dfs(reader, ptr.blockptr, visitor, on_error);
709        }
710    }
711
712    Ok(())
713}
714
715fn tree_walk_tolerant_dfs<R: Read + Seek>(
716    reader: &mut BlockReader<R>,
717    logical: u64,
718    visitor: &mut dyn FnMut(&[u8], &TreeBlock),
719    on_error: &mut dyn FnMut(u64, &io::Error),
720) {
721    let buf = match reader.read_block(logical) {
722        Ok(b) => b,
723        Err(e) => {
724            on_error(logical, &e);
725            return;
726        }
727    };
728    let block = TreeBlock::parse(&buf);
729    visitor(&buf, &block);
730
731    if let TreeBlock::Node { ptrs, .. } = &block {
732        for ptr in ptrs {
733            tree_walk_tolerant_dfs(reader, ptr.blockptr, visitor, on_error);
734        }
735    }
736}
737
738/// Walk a tree (DFS), allowing the visitor to modify each block in place.
739///
740/// The visitor receives the raw block buffer and the parsed `TreeBlock`. If it
741/// returns `true`, the block is re-checksummed using `csum_type` and written
742/// back to disk. This is used by operations that need to patch tree block
743/// headers or items (e.g. fsid rewrite, repair).
744///
745/// # Errors
746///
747/// Returns an error if the root block cannot be read or any write fails.
748pub fn tree_walk_mut<R: Read + Write + Seek>(
749    reader: &mut BlockReader<R>,
750    root_logical: u64,
751    csum_type: superblock::ChecksumType,
752    visitor: &mut dyn FnMut(&mut Vec<u8>, &TreeBlock) -> bool,
753) -> io::Result<()> {
754    let mut buf = reader.read_block(root_logical)?;
755    let block = TreeBlock::parse(&buf);
756
757    let child_ptrs: Vec<u64> = if let TreeBlock::Node { ptrs, .. } = &block {
758        ptrs.iter().map(|p| p.blockptr).collect()
759    } else {
760        Vec::new()
761    };
762
763    if visitor(&mut buf, &block) {
764        crate::util::csum_tree_block(&mut buf, csum_type);
765        reader.write_block(root_logical, &buf)?;
766    }
767
768    for ptr in child_ptrs {
769        tree_walk_mut(reader, ptr, csum_type, visitor)?;
770    }
771
772    Ok(())
773}
774
775/// Read a single block and call `visitor` (and optionally walk children with `follow`).
776///
777/// # Errors
778///
779/// Returns an error if any tree block cannot be read.
780pub fn block_visit<R: Read + Seek>(
781    reader: &mut BlockReader<R>,
782    logical: u64,
783    follow: bool,
784    traversal: Traversal,
785    visitor: &mut dyn FnMut(&TreeBlock),
786) -> io::Result<()> {
787    if follow {
788        tree_walk(reader, logical, traversal, visitor)
789    } else {
790        let block = reader.read_tree_block(logical)?;
791        visitor(&block);
792        Ok(())
793    }
794}
795
/// Statistics collected by walking all blocks of a single B-tree.
///
/// Produced by [`tree_stats_collect`].
#[derive(Debug, Clone)]
pub struct TreeStats {
    /// Total number of tree blocks (nodes and leaves).
    pub total_nodes: u64,
    /// Total bytes occupied by tree blocks (`total_nodes × nodesize`).
    pub total_bytes: u64,
    /// Total bytes of inline file data (non-zero only when `find_inline` is true).
    pub total_inline: u64,
    /// Number of non-contiguous jumps between sibling block addresses.
    pub total_seeks: u64,
    /// Seeks where the next sibling is at a higher address.
    pub forward_seeks: u64,
    /// Seeks where the next sibling is at a lower address.
    pub backward_seeks: u64,
    /// Sum of all seek distances in bytes.
    pub total_seek_len: u64,
    /// Largest single seek distance in bytes.
    pub max_seek_len: u64,
    /// Number of contiguous block runs (clusters) counted between seeks.
    pub total_clusters: u64,
    /// Sum of all cluster sizes in bytes.
    pub total_cluster_size: u64,
    /// Smallest cluster size in bytes (`u64::MAX` if no seeks occurred).
    pub min_cluster_size: u64,
    /// Largest cluster size in bytes (initialised to `nodesize`).
    pub max_cluster_size: u64,
    /// Lowest block bytenr seen during the walk.
    pub lowest_bytenr: u64,
    /// Highest block bytenr seen during the walk.
    pub highest_bytenr: u64,
    /// Number of blocks at each level: index 0 = leaves, higher = internal.
    pub node_counts: Vec<u64>,
    /// Tree height (number of levels, root level + 1).
    pub levels: u8,
}
832
833/// Walk `root_logical` collecting [`TreeStats`].
834///
835/// When `find_inline` is true the walk also counts inline extent data bytes
836/// (relevant for subvolume / FS trees which contain `EXTENT_DATA` items).
837///
838/// # Errors
839///
840/// Returns an error if any tree block cannot be read.
841pub fn tree_stats_collect<R: Read + Seek>(
842    reader: &mut BlockReader<R>,
843    root_logical: u64,
844    find_inline: bool,
845) -> io::Result<TreeStats> {
846    let root_block = reader.read_tree_block(root_logical)?;
847    let nodesize = u64::from(reader.nodesize());
848    let root_level = root_block.header().level;
849    let root_bytenr = root_block.header().bytenr;
850
851    let mut stats = TreeStats {
852        total_nodes: 0,
853        total_bytes: 0,
854        total_inline: 0,
855        total_seeks: 0,
856        forward_seeks: 0,
857        backward_seeks: 0,
858        total_seek_len: 0,
859        max_seek_len: 0,
860        total_clusters: 0,
861        total_cluster_size: 0,
862        min_cluster_size: u64::MAX,
863        max_cluster_size: nodesize,
864        lowest_bytenr: root_bytenr,
865        highest_bytenr: root_bytenr,
866        node_counts: vec![0u64; root_level as usize + 1],
867        levels: root_level + 1,
868    };
869
870    walk_stats(reader, root_block, &mut stats, find_inline, nodesize)?;
871    Ok(stats)
872}
873
/// Recursively walk a tree block, accumulating stats.
///
/// `nodesize` is the fixed tree-block size in bytes, passed down so it is
/// not re-derived from `reader` at every level. Seek/cluster metrics are
/// computed per internal node from the layout of its child pointers: a
/// child that starts exactly `nodesize` past the previous block extends
/// the current contiguous run; any other gap counts as a seek.
fn walk_stats<R: Read + Seek>(
    reader: &mut BlockReader<R>,
    block: TreeBlock,
    stats: &mut TreeStats,
    find_inline: bool,
    nodesize: u64,
) -> io::Result<()> {
    let level = block.header().level;
    let bytenr = block.header().bytenr;

    // Every visited block contributes one node of exactly `nodesize` bytes.
    stats.total_nodes += 1;
    stats.total_bytes += nodesize;
    // `node_counts` was sized from the root's level; a child claiming a
    // deeper level than the root (corruption) is skipped rather than
    // allowed to panic on the index.
    if (level as usize) < stats.node_counts.len() {
        stats.node_counts[level as usize] += 1;
    }
    // Track the full logical address range spanned by the tree.
    if bytenr < stats.lowest_bytenr {
        stats.lowest_bytenr = bytenr;
    }
    if bytenr > stats.highest_bytenr {
        stats.highest_bytenr = bytenr;
    }

    match block {
        TreeBlock::Leaf { items, data, .. } => {
            if find_inline {
                // Offset of the extent type byte within
                // btrfs_file_extent_item, and the size of the fixed
                // header that precedes inline payload bytes.
                let type_off =
                    mem::offset_of!(raw::btrfs_file_extent_item, type_);
                let inline_hdr_size =
                    mem::offset_of!(raw::btrfs_file_extent_item, disk_bytenr);
                let header_size = mem::size_of::<raw::btrfs_header>();
                for item in &items {
                    if item.key.key_type != KeyType::ExtentData {
                        continue;
                    }
                    // Item payloads are addressed relative to the end of
                    // the block header; skip items whose type byte would
                    // fall outside the block data.
                    let start = header_size + item.offset as usize;
                    if start + type_off >= data.len() {
                        continue;
                    }
                    // BTRFS_FILE_EXTENT_INLINE == 0
                    if data[start + type_off] == 0
                        && item.size as usize > inline_hdr_size
                    {
                        // Inline payload = item size minus the fixed
                        // header preceding the inline data.
                        stats.total_inline +=
                            u64::from(item.size) - inline_hdr_size as u64;
                    }
                }
            }
        }
        TreeBlock::Node { ptrs, .. } => {
            // Seek/cluster state machine: the "expected" next address is
            // the previous block's bytenr plus one nodesize. The run
            // begins at this node's own bytenr.
            let mut last_block = bytenr;
            let mut cluster_size = nodesize;

            for ptr in ptrs {
                // Depth-first: fully account the child's subtree before
                // examining this child's placement.
                let child = reader.read_tree_block(ptr.blockptr)?;
                walk_stats(reader, child, stats, find_inline, nodesize)?;

                let cur = ptr.blockptr;
                if last_block + nodesize == cur {
                    // Contiguous with the previous block: grow the run.
                    cluster_size += nodesize;
                } else {
                    // Gap: record a seek of the distance between where
                    // the block "should" be and where it actually is.
                    let distance = cur.abs_diff(last_block + nodesize);
                    stats.total_seeks += 1;
                    stats.total_seek_len += distance;
                    if distance > stats.max_seek_len {
                        stats.max_seek_len = distance;
                    }
                    if cur > last_block + nodesize {
                        stats.forward_seeks += 1;
                    } else {
                        stats.backward_seeks += 1;
                    }
                    // Close the run the seek just ended, but only if it
                    // grew past a single block. NOTE(review): a trailing
                    // run (one not followed by a seek) is never flushed —
                    // consistent with the TreeStats doc that clusters are
                    // "counted between seeks".
                    if cluster_size != nodesize {
                        stats.total_clusters += 1;
                        stats.total_cluster_size += cluster_size;
                        if cluster_size < stats.min_cluster_size {
                            stats.min_cluster_size = cluster_size;
                        }
                        if cluster_size > stats.max_cluster_size {
                            stats.max_cluster_size = cluster_size;
                        }
                    }
                    cluster_size = nodesize;
                }
                last_block = cur;
            }
        }
    }

    Ok(())
}
965
966fn collect_root_items<R: Read + Seek>(
967    reader: &mut BlockReader<R>,
968    logical: u64,
969    tree_roots: &mut BTreeMap<u64, (u64, u64)>,
970) -> io::Result<()> {
971    let block = reader.read_tree_block(logical)?;
972
973    match &block {
974        TreeBlock::Leaf { items, data, .. } => {
975            // ROOT_ITEM contains btrfs_root_item; the bytenr field gives
976            // the root block of that tree.
977            let header_size = mem::size_of::<raw::btrfs_header>();
978            let root_item_bytenr_offset = {
979                // bytenr is after inode (160 bytes) + generation (8) + root_dirid (8)
980                // = offset 176 in btrfs_root_item
981                mem::offset_of!(raw::btrfs_root_item, bytenr)
982            };
983
984            for item in items {
985                if item.key.key_type != KeyType::RootItem {
986                    continue;
987                }
988                let item_start = header_size + item.offset as usize;
989                if item_start + root_item_bytenr_offset + 8 > data.len() {
990                    continue;
991                }
992                let mut buf = &data[item_start + root_item_bytenr_offset..];
993                let bytenr = buf.get_u64_le();
994                if bytenr != 0 {
995                    tree_roots
996                        .insert(item.key.objectid, (bytenr, item.key.offset));
997                }
998            }
999        }
1000        TreeBlock::Node { ptrs, .. } => {
1001            for ptr in ptrs {
1002                collect_root_items(reader, ptr.blockptr, tree_roots)?;
1003            }
1004        }
1005    }
1006
1007    Ok(())
1008}
1009
#[cfg(test)]
mod tests {
    use super::*;
    use crate::chunk::{ChunkMapping, Stripe};
    use std::io::Cursor;
    use uuid::Uuid;

    /// Fake device: a zero-filled in-memory buffer of `len` bytes.
    fn make_device(len: usize) -> Cursor<Vec<u8>> {
        Cursor::new(vec![0u8; len])
    }

    /// Build a chunk mapping with arbitrary stripes. Each entry is
    /// `(devid, physical_offset)`. One stripe gets SINGLE; several
    /// stripes sharing a devid get DUP; otherwise a RAID1 variant is
    /// picked by mirror count (matching what the old `resolve_all`
    /// based tests assumed).
    fn make_mapping(
        logical: u64,
        length: u64,
        stripes: &[(u64, u64)],
    ) -> ChunkMapping {
        let chunk_type = if stripes.len() == 1 {
            // SINGLE
            0
        } else if stripes.windows(2).all(|pair| pair[0].0 == pair[1].0) {
            // Every copy lives on one device: DUP.
            u64::from(raw::BTRFS_BLOCK_GROUP_DUP)
        } else {
            // Mirrored profiles fan out to all stripes; choose the
            // variant matching the mirror count, defaulting to RAID1
            // for 2 stripes or any unexpected count.
            u64::from(match stripes.len() {
                3 => raw::BTRFS_BLOCK_GROUP_RAID1C3,
                4 => raw::BTRFS_BLOCK_GROUP_RAID1C4,
                _ => raw::BTRFS_BLOCK_GROUP_RAID1,
            })
        };

        let mut stripe_vec = Vec::with_capacity(stripes.len());
        for &(devid, offset) in stripes {
            stripe_vec.push(Stripe {
                devid,
                offset,
                dev_uuid: Uuid::nil(),
            });
        }

        ChunkMapping {
            logical,
            length,
            stripe_len: 65536,
            chunk_type,
            num_stripes: stripes.len() as u16,
            sub_stripes: 0,
            stripes: stripe_vec,
        }
    }

    /// Assemble a `BlockReader` over the supplied per-devid cursors
    /// with the given chunk cache contents and a 4 KiB nodesize.
    fn make_reader(
        devices: &[(u64, usize)],
        mappings: &[ChunkMapping],
    ) -> BlockReader<Cursor<Vec<u8>>> {
        let handles: BTreeMap<u64, Cursor<Vec<u8>>> = devices
            .iter()
            .map(|&(devid, len)| (devid, make_device(len)))
            .collect();
        let mut cache = ChunkTreeCache::default();
        for mapping in mappings {
            cache.insert(mapping.clone());
        }
        BlockReader::new_multi(handles, 4096, cache)
    }

    /// Stamp `byte` over `range` of the backing buffer for `devid`.
    fn stamp(
        reader: &mut BlockReader<Cursor<Vec<u8>>>,
        devid: u64,
        range: std::ops::Range<usize>,
        byte: u8,
    ) {
        reader.devices_mut().get_mut(&devid).unwrap().get_mut()[range]
            .fill(byte);
    }

    #[test]
    fn read_block_routes_to_correct_devid() {
        // Two devices carrying distinguishable bytes at the physical
        // offsets the chunk mapping resolves to. resolve picks
        // stripe[0]; devid=2 is listed first to verify routing follows
        // stripe order rather than the numerically-lowest devid.
        let mapping =
            make_mapping(1_000_000, 4096, &[(2, 50_000), (1, 20_000)]);
        let mut reader = make_reader(&[(1, 100_000), (2, 100_000)], &[mapping]);

        // Seed each device with a recognizable byte at the physical
        // offset we'll route to.
        stamp(&mut reader, 1, 20_000..20_000 + 4096, 0xAA);
        stamp(&mut reader, 2, 50_000..50_000 + 4096, 0xBB);

        // stripe[0] = (devid=2, phys=50_000) wins, so we read 0xBB.
        let buf = reader.read_block(1_000_000).expect("read_block");
        assert_eq!(buf.len(), 4096);
        assert!(buf.iter().all(|&b| b == 0xBB), "expected all 0xBB");
    }

    #[test]
    fn read_data_routes_to_correct_devid() {
        let mapping = make_mapping(1_000_000, 4096, &[(5, 8000)]);
        let mut reader = make_reader(&[(5, 100_000)], &[mapping]);
        stamp(&mut reader, 5, 8000..8000 + 100, 0xCC);

        let buf = reader.read_data(1_000_000, 100).expect("read_data");
        assert_eq!(buf, vec![0xCC; 100]);
    }

    #[test]
    fn write_block_fans_out_to_all_stripes() {
        // RAID1 over 2 devices: write_block must land on both resolved
        // physical offsets, leaving everything else zero.
        let mapping = make_mapping(2_000_000, 4096, &[(1, 1000), (2, 7000)]);
        let mut reader = make_reader(&[(1, 100_000), (2, 100_000)], &[mapping]);

        let payload = vec![0xDDu8; 4096];
        reader
            .write_block(2_000_000, &payload)
            .expect("write_block");

        for (devid, phys) in [(1u64, 1000usize), (2, 7000)] {
            let dev = reader.devices().get(&devid).unwrap().get_ref();
            assert_eq!(&dev[phys..phys + 4096], &payload[..]);
            // Untouched regions stay zero on this device.
            assert!(dev[..phys].iter().all(|&b| b == 0));
            assert!(dev[phys + 4096..].iter().all(|&b| b == 0));
        }
    }

    #[test]
    fn write_block_fans_out_to_dup_same_devid() {
        // DUP profile: both copies on the same device at distinct
        // physical offsets. write_block must hit both.
        let mapping = make_mapping(3_000_000, 4096, &[(1, 1000), (1, 50_000)]);
        let mut reader = make_reader(&[(1, 100_000)], &[mapping]);

        let payload = vec![0xEEu8; 4096];
        reader
            .write_block(3_000_000, &payload)
            .expect("write_block");

        let dev = reader.devices().get(&1).unwrap().get_ref();
        for phys in [1000usize, 50_000] {
            assert_eq!(&dev[phys..phys + 4096], &payload[..]);
        }
    }

    #[test]
    fn write_block_three_devices_raid1c3() {
        let mapping = make_mapping(4_000_000, 4096, &[(1, 0), (2, 0), (3, 0)]);
        let mut reader =
            make_reader(&[(1, 8192), (2, 8192), (3, 8192)], &[mapping]);

        let payload = vec![0xFFu8; 4096];
        reader
            .write_block(4_000_000, &payload)
            .expect("write_block");

        for devid in 1u64..=3 {
            let dev = reader.devices().get(&devid).unwrap().get_ref();
            assert_eq!(
                &dev[..4096],
                &payload[..],
                "devid {devid} mirror missing"
            );
        }
    }

    #[test]
    fn read_block_missing_devid_errors() {
        // The chunk cache references devid 9, but only devids 1 and 2
        // have handles. Reads must surface a clear error rather than
        // panicking or silently mis-routing.
        let mapping = make_mapping(5_000_000, 4096, &[(9, 0)]);
        let mut reader = make_reader(&[(1, 8192), (2, 8192)], &[mapping]);

        let err = reader.read_block(5_000_000).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::NotFound);
        assert!(
            err.to_string().contains("device 9"),
            "expected error to mention devid 9, got: {err}"
        );
    }

    #[test]
    fn write_block_missing_devid_errors() {
        let mapping = make_mapping(5_000_000, 4096, &[(1, 0), (9, 0)]);
        let mut reader = make_reader(&[(1, 8192)], &[mapping]);

        let err = reader.write_block(5_000_000, &[0u8; 4096]).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::NotFound);
        assert!(err.to_string().contains("device 9"));
    }

    #[test]
    fn read_block_unmapped_logical_errors() {
        // Nothing in the chunk cache covers this logical address.
        let mut reader = make_reader(&[(1, 8192)], &[]);
        let err = reader.read_block(1_000_000).unwrap_err();
        assert_eq!(err.kind(), io::ErrorKind::NotFound);
        assert!(err.to_string().contains("not mapped"));
    }

    #[test]
    fn new_single_inserts_under_supplied_devid() {
        // BlockReader::new registers the handle under the explicit
        // devid so multi-device callers can mix in other handles later.
        let reader = BlockReader::new(
            make_device(8192),
            7,
            4096,
            ChunkTreeCache::default(),
        );
        assert_eq!(reader.devices().len(), 1);
        assert!(reader.devices().contains_key(&7));
    }

    #[test]
    fn new_multi_with_disjoint_devids() {
        // Sparse devid map (1 and 5 only — devid 3 was removed); both
        // reads and writes should route correctly.
        let mapping = make_mapping(0, 4096, &[(1, 100), (5, 200)]);
        let mut reader = make_reader(&[(1, 8192), (5, 8192)], &[mapping]);

        let payload = vec![0x77u8; 4096];
        reader.write_block(0, &payload).expect("write_block");
        for (devid, phys) in [(1u64, 100usize), (5, 200)] {
            let dev = reader.devices().get(&devid).unwrap().get_ref();
            assert_eq!(&dev[phys..phys + 4096], &payload[..]);
        }
    }

    #[test]
    #[should_panic(expected = "filesystem has 2 devices")]
    fn single_device_mut_panics_on_multi_device() {
        let mapping = make_mapping(0, 4096, &[(1, 0), (2, 0)]);
        let mut reader = make_reader(&[(1, 4096), (2, 4096)], &[mapping]);
        let _ = reader.single_device_mut();
    }

    #[test]
    fn single_device_mut_returns_handle_for_single_device() {
        let mapping = make_mapping(0, 4096, &[(1, 0)]);
        let mut reader = make_reader(&[(1, 8192)], &[mapping]);
        // Should not panic; prove the returned handle aliases the map
        // entry by writing a marker byte through it.
        reader.single_device_mut().get_mut()[42] = 0x99;
        assert_eq!(reader.devices().get(&1).unwrap().get_ref()[42], 0x99);
    }
}