Skip to main content

btrfs_fs/
filesystem.rs

1//! The [`Filesystem`] type and its operation methods.
2//!
3//! [`Filesystem`] is `Clone` (cheap `Arc` bump) and exposes all
4//! operations as `async fn`. Internally each op runs the (currently
5//! sync) I/O work inside [`tokio::task::spawn_blocking`] so the
6//! async runtime is never blocked. Future work (a native async I/O
7//! backend, lock-free cache hits) is internal and won't change the
8//! API.
9//!
10//! Embedders must call these methods inside a tokio runtime context.
11//! `btrfs-fuse` provides one; tests use `#[tokio::test]`; other
12//! embedders bring their own.
13
14use crate::{
15    CacheConfig, Entry, FileKind, Stat,
16    cache::{ExtentMapCache, InodeCache, LruTreeBlockCache},
17    dir, read,
18    stat::to_system_time,
19    xattr,
20};
21use btrfs_disk::{
22    items::{
23        DeviceItem, DirItem, FileExtentBody, InodeExtref, InodeItem, InodeRef,
24        RootItem, RootItemFlags, RootRef, Timespec as DiskTimespec,
25    },
26    reader::{BlockReader, Traversal, filesystem_open, tree_walk},
27    superblock::Superblock,
28    tree::{KeyType, TreeBlock},
29};
30use btrfs_stream::{StreamCommand, StreamWriter, Timespec as StreamTimespec};
31use std::{
32    collections::BTreeMap,
33    io, mem,
34    sync::{Arc, Mutex, MutexGuard},
35    time::SystemTime,
36};
37use uuid::Uuid;
38
39/// `BTRFS_FS_TREE_OBJECTID` — the default subvolume's tree root.
40const FS_TREE_OBJECTID: u64 = 5;
41
42/// `BTRFS_FIRST_FREE_OBJECTID` — the root directory of any subvolume,
43/// and the lower bound for non-default subvolume IDs.
44const ROOT_DIR_OBJECTID: u64 = 256;
45
46/// `BTRFS_LAST_FREE_OBJECTID` — upper bound of the user-subvolume id
47/// range. Anything above is reserved for system trees (UUID, etc.).
48const LAST_FREE_OBJECTID: u64 = u64::MAX - 256;
49
50/// Stream version emitted by [`Filesystem::send`]. v1 is the
51/// conservative pick — every byte goes through plain `WRITE`, no
52/// encoded passthrough, no clone refs. Maximum compatibility with
53/// receivers in the wild.
54const SEND_STREAM_VERSION: u32 = 1;
55
56/// Maximum bytes per `WRITE` command on the v1 stream. The TLV
57/// length field is `u16`, so a strict upper bound is 65 535 bytes;
58/// we leave headroom for the path/offset attributes plus the
59/// framed-command overhead. 48 KiB is what the kernel uses.
60const SEND_WRITE_CHUNK_BYTES: usize = 48 * 1024;
61
62/// Whether `id` names a subvolume tree (the default `FS_TREE` plus
63/// the user-allocatable range). Used to filter system trees out of
64/// [`Filesystem::list_subvolumes`] and to validate `open_subvol`.
65fn is_subvolume_id(id: u64) -> bool {
66    id == FS_TREE_OBJECTID
67        || (ROOT_DIR_OBJECTID..=LAST_FREE_OBJECTID).contains(&id)
68}
69
70/// Identifier for a subvolume tree (the tree's root objectid).
71///
72/// For the default subvolume this is `5` (`BTRFS_FS_TREE_OBJECTID`).
73/// Custom subvolumes use `256` and up.
74#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash)]
75pub struct SubvolId(pub u64);
76
77/// A filesystem-level inode reference: the subvolume it lives in plus
78/// the on-disk objectid within that subvolume.
79#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
80pub struct Inode {
81    pub subvol: SubvolId,
82    pub ino: u64,
83}
84
85/// Metadata for a single subvolume, returned by
86/// [`Filesystem::list_subvolumes`] and
87/// [`Filesystem::get_subvol_info`].
88///
89/// Marked `#[non_exhaustive]` so additional fields can be added in
90/// the future without breaking pattern destructuring at call sites.
91#[derive(Debug, Clone)]
92#[non_exhaustive]
93pub struct SubvolInfo {
94    /// The subvolume's tree id.
95    pub id: SubvolId,
96    /// Parent subvolume, or `None` for the default `FS_TREE`
97    /// (`SubvolId(5)`) — it has no `ROOT_BACKREF` because nothing
98    /// contains it.
99    pub parent: Option<SubvolId>,
100    /// Path component of this subvolume within its parent. Empty
101    /// for the default `FS_TREE`.
102    pub name: Vec<u8>,
103    /// Inode number of the directory in `parent` that holds this
104    /// subvolume entry. Zero for top-level subvolumes (no parent).
105    pub dirid: u64,
106    /// Read-only flag (set on snapshots taken with `-r`, or `--ro`
107    /// on `mkfs --subvol`).
108    pub readonly: bool,
109    /// Last modification time of the subvolume's `ROOT_ITEM`.
110    pub ctime: SystemTime,
111    /// Creation time.
112    pub otime: SystemTime,
113    /// Generation when the subvolume was created or last modified.
114    pub generation: u64,
115    /// Transaction id of the last `ROOT_ITEM` update.
116    pub ctransid: u64,
117    /// Transaction id when the subvolume was created.
118    pub otransid: u64,
119    /// UUID of this subvolume.
120    pub uuid: Uuid,
121    /// UUID of the parent subvolume (for snapshots). All zeros for
122    /// non-snapshot subvolumes.
123    pub parent_uuid: Uuid,
124    /// UUID of the subvolume this was received from (for `btrfs
125    /// receive`). All zeros for non-received subvolumes.
126    pub received_uuid: Uuid,
127}
128
129/// Compound-key range filter for [`Filesystem::tree_search`]. Mirrors
130/// the kernel's `btrfs_ioctl_search_key` semantics: items are returned
131/// where `(min_objectid, min_type, min_offset) <= (key) <= (max_objectid,
132/// max_type, max_offset)` treated as a single 17-byte compound key, AND
133/// the leaf's generation falls in `[min_transid, max_transid]`.
134#[derive(Debug, Clone, Copy)]
135pub struct SearchFilter {
136    /// Tree to search (e.g. `1` for root tree, `5` for default FS tree,
137    /// or any subvolume id).
138    pub tree_id: u64,
139    pub min_objectid: u64,
140    pub max_objectid: u64,
141    pub min_type: u32,
142    pub max_type: u32,
143    pub min_offset: u64,
144    pub max_offset: u64,
145    pub min_transid: u64,
146    pub max_transid: u64,
147    /// Maximum items to return.
148    pub max_items: u32,
149}
150
151/// One item returned by [`Filesystem::tree_search`]. `transid` is the
152/// leaf's generation (matching the kernel ioctl's
153/// `btrfs_ioctl_search_header.transid`).
154#[derive(Debug, Clone)]
155pub struct SearchItem {
156    pub transid: u64,
157    pub objectid: u64,
158    pub item_type: u32,
159    pub offset: u64,
160    pub data: Vec<u8>,
161}
162
163/// Whence value for [`Filesystem::seek_hole_data`]. Maps to the
164/// POSIX `SEEK_HOLE` / `SEEK_DATA` whence constants used by
165/// `lseek(2)`.
166#[derive(Debug, Clone, Copy, PartialEq, Eq)]
167pub enum SeekHoleData {
168    /// `SEEK_DATA`: return the offset of the start of the next data
169    /// region at or after the given offset.
170    Data,
171    /// `SEEK_HOLE`: return the offset of the start of the next hole
172    /// at or after the given offset. Always succeeds within the
173    /// file because EOF is treated as a virtual hole.
174    Hole,
175}
176
177/// Filesystem-wide statistics, returned by [`Filesystem::statfs`].
178#[derive(Debug, Clone, Copy, PartialEq, Eq)]
179pub struct StatFs {
180    /// Total blocks (in units of `bsize`).
181    pub blocks: u64,
182    /// Free blocks (in units of `bsize`).
183    pub bfree: u64,
184    /// Blocks available to unprivileged users (same as `bfree` for btrfs).
185    pub bavail: u64,
186    /// Preferred block size.
187    pub bsize: u32,
188    /// Maximum filename length.
189    pub namelen: u32,
190    /// Fragment size (same as `bsize` for btrfs).
191    pub frsize: u32,
192}
193
194/// High-level read-only btrfs filesystem.
195///
196/// `Filesystem` is a cheap-to-clone handle (`Arc` internally) and all
197/// operations are `async fn`, so multiple tokio tasks can drive the
198/// same filesystem concurrently. I/O still serialises on a single
199/// internal mutex (held only inside `spawn_blocking`); a future phase
200/// can swap that for a reader pool without changing the public API.
201pub struct Filesystem<R: io::Read + io::Seek + Send + 'static> {
202    inner: Arc<Inner<R>>,
203}
204
205impl<R: io::Read + io::Seek + Send + 'static> Clone for Filesystem<R> {
206    fn clone(&self) -> Self {
207        Self {
208            inner: Arc::clone(&self.inner),
209        }
210    }
211}
212
213/// Shared state behind every [`Filesystem`] handle.
214struct Inner<R: io::Read + io::Seek + Send + 'static> {
215    /// Parsed primary-device superblock.
216    superblock: Superblock,
217    /// Map of tree id → (root block logical address, key offset).
218    /// Multi-subvolume support will look up additional entries here.
219    tree_roots: BTreeMap<u64, (u64, u64)>,
220    /// Cached objectid of the default subvolume.
221    default_subvol: SubvolId,
222    /// Filesystem sectorsize, forwarded from the superblock.
223    blksize: u32,
224    /// I/O backend. Held only inside `spawn_blocking`, never across an
225    /// `.await`. Future work can swap this for a pool of readers
226    /// without changing the public `&self` API.
227    reader: Mutex<BlockReader<R>>,
228    /// Concrete reference to the tree-block cache. Stored alongside
229    /// the `Arc<dyn TreeBlockCache>` inside the reader so callers can
230    /// inspect [`crate::CacheStats`] without going through the trait.
231    tree_block_cache: Arc<LruTreeBlockCache>,
232    /// Inode cache. Hit on `getattr`, `lookup`, `readlink`, `read`.
233    /// Populated whenever an `InodeItem` is parsed from the tree.
234    inode_cache: InodeCache,
235    /// Per-inode extent map cache. Built lazily on first `read` of a
236    /// file; reused for subsequent reads of the same inode.
237    extent_map_cache: ExtentMapCache,
238}
239
240impl<R: io::Read + io::Seek + Send + 'static> Filesystem<R> {
241    /// Bootstrap the filesystem from a reader over an image or block
242    /// device, with default cache sizes.
243    ///
244    /// This is sync because the heavy work happens during the bootstrap
245    /// (chunk tree walk, root tree walk) and only runs once. Embedders
246    /// that want non-blocking open can wrap the call in
247    /// `tokio::task::spawn_blocking` themselves.
248    pub fn open(reader: R) -> io::Result<Self> {
249        Self::open_inner(
250            reader,
251            SubvolId(FS_TREE_OBJECTID),
252            CacheConfig::default(),
253        )
254    }
255
256    /// Bootstrap the filesystem and select a non-default subvolume
257    /// as the [`Filesystem::root`], with default cache sizes.
258    ///
259    /// `subvol` must be the tree id of an existing subvolume — pass
260    /// the value from a previously-listed [`SubvolInfo::id`], or use
261    /// `SubvolId(5)` to get the default. Errors with `NotFound` if
262    /// the id is unknown, `InvalidInput` if it's outside the
263    /// subvolume id range.
264    pub fn open_subvol(reader: R, subvol: SubvolId) -> io::Result<Self> {
265        Self::open_inner(reader, subvol, CacheConfig::default())
266    }
267
268    /// Like [`Filesystem::open`] but with caller-chosen cache sizes.
269    /// Use [`CacheConfig::no_cache`] for benchmarking the cold path
270    /// or memory-constrained embedders; otherwise tune the
271    /// individual entries to match your workload.
272    pub fn open_with_caches(
273        reader: R,
274        cache_config: CacheConfig,
275    ) -> io::Result<Self> {
276        Self::open_inner(reader, SubvolId(FS_TREE_OBJECTID), cache_config)
277    }
278
279    /// Like [`Filesystem::open_subvol`] but with caller-chosen cache
280    /// sizes.
281    pub fn open_subvol_with_caches(
282        reader: R,
283        subvol: SubvolId,
284        cache_config: CacheConfig,
285    ) -> io::Result<Self> {
286        Self::open_inner(reader, subvol, cache_config)
287    }
288
289    fn open_inner(
290        reader: R,
291        default_subvol: SubvolId,
292        cache_config: CacheConfig,
293    ) -> io::Result<Self> {
294        if !is_subvolume_id(default_subvol.0) {
295            return Err(io::Error::new(
296                io::ErrorKind::InvalidInput,
297                format!(
298                    "{} is not a valid subvolume id (must be 5 or in \
299                     [256, u64::MAX - 256])",
300                    default_subvol.0,
301                ),
302            ));
303        }
304        let mut fs = filesystem_open(reader)
305            .map_err(|e| io::Error::other(e.to_string()))?;
306        let blksize = fs.superblock.sectorsize;
307        if !fs.tree_roots.contains_key(&FS_TREE_OBJECTID) {
308            return Err(io::Error::other(
309                "default FS tree (objectid 5) not found in root tree",
310            ));
311        }
312        if !fs.tree_roots.contains_key(&default_subvol.0) {
313            return Err(io::Error::new(
314                io::ErrorKind::NotFound,
315                format!("subvolume {} not found", default_subvol.0),
316            ));
317        }
318        // Attach a tree-block cache to the underlying reader so every
319        // tree walk past the first benefits transparently. We keep
320        // both a typed `Arc<LruTreeBlockCache>` (for stats) and the
321        // erased `Arc<dyn TreeBlockCache>` (for the reader) — they
322        // point at the same instance.
323        let tree_block_cache =
324            Arc::new(LruTreeBlockCache::new(cache_config.tree_blocks));
325        fs.reader.set_cache(Some(tree_block_cache.clone()
326            as Arc<dyn btrfs_disk::reader::TreeBlockCache>));
327        Ok(Self {
328            inner: Arc::new(Inner {
329                superblock: fs.superblock,
330                tree_roots: fs.tree_roots,
331                default_subvol,
332                blksize,
333                reader: Mutex::new(fs.reader),
334                tree_block_cache,
335                inode_cache: InodeCache::new(cache_config.inodes),
336                extent_map_cache: ExtentMapCache::new(cache_config.extent_maps),
337            }),
338        })
339    }
340
341    /// Snapshot of the tree-block cache hit/miss counters. Useful for
342    /// tests, benchmarks, and embedders surfacing cache metrics.
343    #[must_use]
344    pub fn tree_block_cache_stats(&self) -> crate::CacheStats {
345        self.inner.tree_block_cache.stats()
346    }
347
348    /// Inode of the default subvolume's root directory (objectid 256).
349    #[must_use]
350    pub fn root(&self) -> Inode {
351        Inode {
352            subvol: self.inner.default_subvol,
353            ino: ROOT_DIR_OBJECTID,
354        }
355    }
356
357    /// The subvolume `Filesystem` was opened against (the default
358    /// `FS_TREE` unless [`Filesystem::open_subvol`] was used).
359    #[must_use]
360    pub fn default_subvol(&self) -> SubvolId {
361        self.inner.default_subvol
362    }
363
364    /// Enumerate every subvolume on the filesystem.
365    ///
366    /// Walks the root tree, parsing `ROOT_ITEM` and `ROOT_BACKREF`
367    /// entries to build a [`SubvolInfo`] per subvolume. The default
368    /// `FS_TREE` (`SubvolId(5)`) is included with `parent: None` and
369    /// an empty `name`.
370    pub async fn list_subvolumes(&self) -> io::Result<Vec<SubvolInfo>> {
371        let this = self.clone();
372        spawn_blocking(move || this.list_subvolumes_blocking()).await
373    }
374
375    /// Fetch metadata for a single subvolume.
376    ///
377    /// Currently implemented as a filtered [`Filesystem::list_subvolumes`]
378    /// call — the root-tree walk dominates either way, so a one-shot
379    /// lookup wouldn't be faster than the cached/single-pass list.
380    /// Returns `Ok(None)` for an unknown id.
381    pub async fn get_subvol_info(
382        &self,
383        id: SubvolId,
384    ) -> io::Result<Option<SubvolInfo>> {
385        Ok(self
386            .list_subvolumes()
387            .await?
388            .into_iter()
389            .find(|s| s.id == id))
390    }
391
392    /// Read-only access to the parsed primary-device superblock.
393    /// Used by ioctl handlers (`FS_INFO`, `GET_FEATURES`, etc.) and
394    /// by embedders that need to inspect format-level fields without
395    /// re-parsing.
396    #[must_use]
397    pub fn superblock(&self) -> &Superblock {
398        &self.inner.superblock
399    }
400
401    /// Return the [`DeviceItem`] for `devid`, or `None` if no such
402    /// device exists on this filesystem.
403    ///
404    /// Currently single-device-only: returns the primary device's
405    /// embedded `dev_item` from the superblock when `devid == 1`,
406    /// `None` otherwise. Multi-device support would walk the dev
407    /// tree; landing alongside multi-device write support.
408    #[must_use]
409    pub fn dev_info(&self, devid: u64) -> Option<DeviceItem> {
410        if self.inner.superblock.dev_item.devid == devid {
411            Some(self.inner.superblock.dev_item.clone())
412        } else {
413            None
414        }
415    }
416
417    /// Run a tree search matching the kernel's `BTRFS_IOC_TREE_SEARCH_V2`
418    /// semantics. Returns at most `filter.max_items` items, stopping
419    /// early if `max_buf_size` (the userspace buffer cap, including
420    /// per-item 32-byte headers) would be exceeded by adding the next
421    /// item.
422    ///
423    /// Note: the kernel runs the search against any tree by id; we
424    /// only resolve subvolume trees and the root tree (`tree_id == 1`).
425    /// Searches against the chunk/extent/csum/etc. trees would need
426    /// additional plumbing — they're not exposed today.
427    pub async fn tree_search(
428        &self,
429        filter: SearchFilter,
430        max_buf_size: usize,
431    ) -> io::Result<Vec<SearchItem>> {
432        let this = self.clone();
433        spawn_blocking(move || this.tree_search_blocking(filter, max_buf_size))
434            .await
435    }
436
437    /// Resolve `objectid` in `subvol` to its slash-separated path
438    /// from the subvolume root.
439    ///
440    /// Walks the `INODE_REF` chain upwards from `objectid` until it
441    /// reaches the subvolume root (objectid 256). For directories
442    /// the kernel `INO_LOOKUP` ioctl returns the path with a
443    /// trailing `/`; this helper does NOT add one — the caller can
444    /// append if it needs to mimic that exactly.
445    ///
446    /// Returns `Ok(None)` if any step in the chain has no
447    /// `INODE_REF` (orphaned inode, or wrong subvol).
448    pub async fn ino_lookup(
449        &self,
450        subvol: SubvolId,
451        objectid: u64,
452    ) -> io::Result<Option<Vec<u8>>> {
453        let this = self.clone();
454        spawn_blocking(move || this.ino_lookup_blocking(subvol, objectid)).await
455    }
456
457    /// Resolve every path in `subvol` that names `objectid`. A regular
458    /// inode has a single path; hard-linked files have one entry per
459    /// link, in unspecified order. Returns an empty vector for an
460    /// orphan inode (no `INODE_REF` / `INODE_EXTREF`).
461    ///
462    /// Each returned path is relative to the subvolume root, with no
463    /// leading slash.
464    pub async fn ino_paths(
465        &self,
466        subvol: SubvolId,
467        objectid: u64,
468    ) -> io::Result<Vec<Vec<u8>>> {
469        let this = self.clone();
470        spawn_blocking(move || this.ino_paths_blocking(subvol, objectid)).await
471    }
472
473    /// Filesystem sectorsize.
474    #[must_use]
475    pub fn blksize(&self) -> u32 {
476        self.inner.blksize
477    }
478
479    /// Drop cached state for `ino` from both the inode and
480    /// extent-map caches. Embedders that observe inode-level
481    /// invalidation events (FUSE `forget`, manual cache pressure)
482    /// can call this to release memory ahead of LRU eviction.
483    /// Safe to call for an inode that's never been cached: it's a
484    /// no-op in that case.
485    pub fn forget(&self, ino: Inode) {
486        self.inner.inode_cache.invalidate(ino);
487        self.inner.extent_map_cache.invalidate(ino);
488    }
489
490    /// Look up a child of `parent` by name.
491    pub async fn lookup(
492        &self,
493        parent: Inode,
494        name: &[u8],
495    ) -> io::Result<Option<(Inode, InodeItem)>> {
496        let this = self.clone();
497        let name = name.to_vec();
498        spawn_blocking(move || this.lookup_blocking(parent, &name)).await
499    }
500
501    /// Read the inode item for `ino`.
502    pub async fn read_inode_item(
503        &self,
504        ino: Inode,
505    ) -> io::Result<Option<InodeItem>> {
506        let this = self.clone();
507        spawn_blocking(move || this.read_inode_item_blocking(ino)).await
508    }
509
510    /// Read inode metadata as a [`Stat`].
511    pub async fn getattr(&self, ino: Inode) -> io::Result<Option<Stat>> {
512        let this = self.clone();
513        spawn_blocking(move || this.getattr_blocking(ino)).await
514    }
515
516    /// List the entries of a directory inode, starting strictly after
517    /// `offset`. `.` and `..` are synthesised at offsets 0 and 1.
518    pub async fn readdir(
519        &self,
520        dir_ino: Inode,
521        offset: u64,
522    ) -> io::Result<Vec<Entry>> {
523        let this = self.clone();
524        spawn_blocking(move || this.readdir_blocking(dir_ino, offset)).await
525    }
526
527    /// Like [`Filesystem::readdir`] but pairs each [`Entry`] with its
528    /// [`Stat`] so callers don't need a separate `getattr` per entry.
529    /// The FUSE driver feeds this into the kernel's `READDIRPLUS`
530    /// path, which collapses `ls -l`-style listings into one round
531    /// trip.
532    ///
533    /// Entries that vanish between the directory walk and the stat
534    /// (effectively impossible on a read-only mount, but defended
535    /// for robustness) are dropped from the result rather than
536    /// erroring.
537    pub async fn readdirplus(
538        &self,
539        dir_ino: Inode,
540        offset: u64,
541    ) -> io::Result<Vec<(Entry, Stat)>> {
542        let this = self.clone();
543        spawn_blocking(move || this.readdirplus_blocking(dir_ino, offset)).await
544    }
545
546    /// Read the target of a symbolic link.
547    pub async fn readlink(&self, ino: Inode) -> io::Result<Option<Vec<u8>>> {
548        let this = self.clone();
549        spawn_blocking(move || this.readlink_blocking(ino)).await
550    }
551
552    /// Read `size` bytes from `ino` starting at `offset`. Sparse holes
553    /// and prealloc extents return zeros; compressed extents are
554    /// decompressed.
555    pub async fn read(
556        &self,
557        ino: Inode,
558        offset: u64,
559        size: u32,
560    ) -> io::Result<Vec<u8>> {
561        let this = self.clone();
562        spawn_blocking(move || this.read_blocking(ino, offset, size)).await
563    }
564
565    /// Resolve a slash-separated subvolume path (relative to the
566    /// filesystem root) to its [`SubvolId`]. Empty path / `"/"`
567    /// resolves to the default `FS_TREE` (`SubvolId(5)`).
568    ///
569    /// Walks the same `list_subvolumes` graph that
570    /// `btrfs subvolume list` uses, so any path the user could see
571    /// in that listing is resolvable here. Returns `Ok(None)` if
572    /// the path doesn't match any subvolume.
573    pub async fn resolve_subvol_path(
574        &self,
575        path: &str,
576    ) -> io::Result<Option<SubvolId>> {
577        let trimmed = path.trim_matches('/');
578        if trimmed.is_empty() {
579            return Ok(Some(SubvolId(FS_TREE_OBJECTID)));
580        }
581        let subvols = self.list_subvolumes().await?;
582        let by_id: BTreeMap<SubvolId, &SubvolInfo> =
583            subvols.iter().map(|s| (s.id, s)).collect();
584        let target = trimmed.as_bytes();
585        for s in &subvols {
586            if subvol_full_path(&by_id, s.id) == target {
587                return Ok(Some(s.id));
588            }
589        }
590        Ok(None)
591    }
592
593    /// Generate a v1 send stream describing `snapshot` and write it
594    /// to `output`. Tier 1 of the send roadmap: full sends only
595    /// (no `parent`), no clone sources, no encoded-write
596    /// passthrough.
597    ///
598    /// The stream begins with a `SUBVOL` command, then walks the
599    /// subvolume tree path-first emitting per-inode creation
600    /// commands (`Mkfile` / `Mkdir` / `Symlink` / `Mknod` / `Mkfifo`
601    /// / `Mksock`), `SetXattr` for each xattr, `Write` chunks for
602    /// regular file contents, and `Truncate` / `Chmod` / `Chown` /
603    /// `Utimes` to finalise. Terminates with `End`.
604    ///
605    /// Hardlinks beyond the first reference become `Link` commands
606    /// rather than re-creating the inode. Subvolume crossings
607    /// (`DirItem` whose `location.key_type == ROOT_ITEM`) are
608    /// skipped — caller must run `send` again per subvolume.
609    ///
610    /// Encodes paths as UTF-8 (lossy on invalid byte sequences).
611    /// Real btrfs filenames are arbitrary bytes; full-fidelity
612    /// non-UTF-8 support can come later if a real workload needs
613    /// it.
614    ///
615    /// # Errors
616    ///
617    /// Returns an error if the subvolume isn't found, any tree
618    /// read fails, or the underlying writer fails.
619    pub async fn send<W: io::Write + Send + 'static>(
620        &self,
621        snapshot: SubvolId,
622        output: W,
623    ) -> io::Result<W> {
624        let this = self.clone();
625        spawn_blocking(move || this.send_blocking(snapshot, output)).await
626    }
627
628    /// Find the next hole or data region in `ino` at or after
629    /// `offset`. Mirrors `lseek(fd, offset, SEEK_HOLE)` /
630    /// `lseek(fd, offset, SEEK_DATA)` semantics.
631    ///
632    /// `SEEK_DATA` returns the offset of the next byte that is part
633    /// of a data region. Returns `Err(ENXIO)` if no data exists at
634    /// or after `offset` (e.g. `offset >= file_size`).
635    ///
636    /// `SEEK_HOLE` returns the offset of the next hole. EOF is
637    /// always considered a virtual hole, so this succeeds for any
638    /// `offset < file_size`. Returns `Err(ENXIO)` only when
639    /// `offset >= file_size`.
640    ///
641    /// Holes include both implicit (gaps with no `EXTENT_DATA` item)
642    /// and explicit (regular extent with `disk_bytenr == 0`)
643    /// representations. Inline and prealloc extents are treated as
644    /// data — matching kernel btrfs and POSIX convention.
645    pub async fn seek_hole_data(
646        &self,
647        ino: Inode,
648        offset: u64,
649        whence: SeekHoleData,
650    ) -> io::Result<u64> {
651        let this = self.clone();
652        spawn_blocking(move || {
653            this.seek_hole_data_blocking(ino, offset, whence)
654        })
655        .await
656    }
657
658    /// List all xattr names for an inode.
659    pub async fn xattr_list(&self, ino: Inode) -> io::Result<Vec<Vec<u8>>> {
660        let this = self.clone();
661        spawn_blocking(move || this.xattr_list_blocking(ino)).await
662    }
663
664    /// Look up the value of a single xattr by exact name.
665    pub async fn xattr_get(
666        &self,
667        ino: Inode,
668        name: &[u8],
669    ) -> io::Result<Option<Vec<u8>>> {
670        let this = self.clone();
671        let name = name.to_vec();
672        spawn_blocking(move || this.xattr_get_blocking(ino, &name)).await
673    }
674
675    /// Filesystem-wide statistics pulled straight from the superblock.
676    /// No I/O — sync.
677    #[must_use]
678    pub fn statfs(&self) -> StatFs {
679        let sb = &self.inner.superblock;
680        let bsize = u64::from(sb.sectorsize);
681        let blocks = sb.total_bytes / bsize;
682        let bfree = sb.total_bytes.saturating_sub(sb.bytes_used) / bsize;
683        StatFs {
684            blocks,
685            bfree,
686            bavail: bfree,
687            bsize: sb.sectorsize,
688            namelen: 255,
689            frsize: sb.sectorsize,
690        }
691    }
692
693    // ── Sync (blocking) implementations ─────────────────────────────
694    //
695    // The `_blocking` methods carry the actual logic. They run inside
696    // `spawn_blocking`, so they're allowed to take the reader Mutex
697    // and do sync I/O without blocking the runtime.
698
699    fn lookup_blocking(
700        &self,
701        parent: Inode,
702        name: &[u8],
703    ) -> io::Result<Option<(Inode, InodeItem)>> {
704        let parent_tree = self.tree_root_for(parent.subvol)?;
705        let mut reader = self.lock_reader();
706        let Some(entry) =
707            lookup_in_dir(&mut reader, parent_tree, parent.ino, name)?
708        else {
709            return Ok(None);
710        };
711
712        // Subvolume crossing: a `DirItem` whose `location` points at a
713        // `ROOT_ITEM` key is not a regular dirent — it's a mount of
714        // another subvolume's root. The child inode lives at objectid
715        // 256 of that subvolume's tree, not at `entry.location.objectid`
716        // in the parent's tree.
717        let child = if entry.location.key_type == KeyType::RootItem {
718            Inode {
719                subvol: SubvolId(entry.location.objectid),
720                ino: ROOT_DIR_OBJECTID,
721            }
722        } else {
723            Inode {
724                subvol: parent.subvol,
725                ino: entry.location.objectid,
726            }
727        };
728
729        // Reuse the cached InodeItem if present; otherwise fetch and
730        // populate. The fetch uses the *child*'s tree (which equals
731        // the parent's for non-crossings).
732        let item = if let Some(cached) = self.inner.inode_cache.get(child) {
733            (*cached).clone()
734        } else {
735            let child_tree = self.tree_root_for(child.subvol)?;
736            let Some(item) = read_inode(&mut reader, child_tree, child.ino)?
737            else {
738                return Ok(None);
739            };
740            self.inner.inode_cache.put(child, Arc::new(item.clone()));
741            item
742        };
743        Ok(Some((child, item)))
744    }
745
746    fn read_inode_item_blocking(
747        &self,
748        ino: Inode,
749    ) -> io::Result<Option<InodeItem>> {
750        if let Some(cached) = self.inner.inode_cache.get(ino) {
751            return Ok(Some((*cached).clone()));
752        }
753        let tree_root = self.tree_root_for(ino.subvol)?;
754        let mut reader = self.lock_reader();
755        let Some(item) = read_inode(&mut reader, tree_root, ino.ino)? else {
756            return Ok(None);
757        };
758        self.inner.inode_cache.put(ino, Arc::new(item.clone()));
759        Ok(Some(item))
760    }
761
762    fn getattr_blocking(&self, ino: Inode) -> io::Result<Option<Stat>> {
763        Ok(self
764            .read_inode_item_blocking(ino)?
765            .map(|item| Stat::from_inode(ino, &item, self.inner.blksize)))
766    }
767
768    fn readdir_blocking(
769        &self,
770        dir_ino: Inode,
771        offset: u64,
772    ) -> io::Result<Vec<Entry>> {
773        let tree_root = self.tree_root_for(dir_ino.subvol)?;
774        let mut entries: Vec<Entry> = Vec::new();
775        let mut reader = self.lock_reader();
776
777        if offset == 0 {
778            entries.push(Entry {
779                ino: dir_ino,
780                kind: FileKind::Directory,
781                name: b".".to_vec(),
782                offset: 1,
783            });
784        }
785        if offset <= 1 {
786            // For most inodes `..` lives in the same subvolume tree
787            // (walk INODE_REF). For a subvolume root we instead walk
788            // ROOT_BACKREF in the root tree to find the parent
789            // subvolume's containing directory.
790            let parent = if dir_ino.ino == ROOT_DIR_OBJECTID
791                && dir_ino.subvol.0 != FS_TREE_OBJECTID
792            {
793                find_root_backref_parent(
794                    &mut reader,
795                    self.inner.superblock.root,
796                    dir_ino.subvol.0,
797                )?
798                .unwrap_or(dir_ino)
799            } else {
800                let parent_oid =
801                    find_parent_oid(&mut reader, tree_root, dir_ino.ino)?;
802                Inode {
803                    subvol: dir_ino.subvol,
804                    ino: parent_oid,
805                }
806            };
807            entries.push(Entry {
808                ino: parent,
809                kind: FileKind::Directory,
810                name: b"..".to_vec(),
811                offset: 2,
812            });
813        }
814
815        let cursor = offset.max(2);
816        let mut dir_entries: Vec<Entry> = Vec::new();
817        for_each_item(&mut reader, tree_root, |key, data| {
818            if key.objectid != dir_ino.ino || key.key_type != KeyType::DirIndex
819            {
820                return;
821            }
822            if key.offset < cursor {
823                return;
824            }
825            for item in DirItem::parse_all(data) {
826                // Cookie is "next offset to start from", so add 1.
827                let entry = dir::Entry::from_dir_item(
828                    dir_ino.subvol,
829                    &item,
830                    key.offset + 1,
831                );
832                dir_entries.push(entry);
833            }
834        })?;
835        dir_entries.sort_by_key(|e| e.offset);
836        entries.extend(dir_entries);
837        Ok(entries)
838    }
839
840    fn readdirplus_blocking(
841        &self,
842        dir_ino: Inode,
843        offset: u64,
844    ) -> io::Result<Vec<(Entry, Stat)>> {
845        let entries = self.readdir_blocking(dir_ino, offset)?;
846        let blksize = self.inner.blksize;
847        let mut out = Vec::with_capacity(entries.len());
848        for entry in entries {
849            // `read_inode_item_blocking` consults the inode cache
850            // first, so repeated `readdirplus` over the same dir
851            // pays at most one tree walk per inode across the
852            // working set.
853            if let Some(item) = self.read_inode_item_blocking(entry.ino)? {
854                let stat = Stat::from_inode(entry.ino, &item, blksize);
855                out.push((entry, stat));
856            }
857        }
858        Ok(out)
859    }
860
861    fn readlink_blocking(&self, ino: Inode) -> io::Result<Option<Vec<u8>>> {
862        let Some(item) = self.read_inode_item_blocking(ino)? else {
863            return Ok(None);
864        };
865        let tree_root = self.tree_root_for(ino.subvol)?;
866        let blksize = self.inner.blksize;
867        let mut reader = self.lock_reader();
868        let target =
869            read::read_symlink(&mut reader, tree_root, ino.ino, blksize)?;
870        #[allow(clippy::cast_possible_truncation)]
871        Ok(target.map(|mut t| {
872            t.truncate(item.size as usize);
873            t
874        }))
875    }
876
877    fn read_blocking(
878        &self,
879        ino: Inode,
880        offset: u64,
881        size: u32,
882    ) -> io::Result<Vec<u8>> {
883        let Some(item) = self.read_inode_item_blocking(ino)? else {
884            return Err(io::Error::new(
885                io::ErrorKind::NotFound,
886                format!("inode {} not found", ino.ino),
887            ));
888        };
889        let tree_root = self.tree_root_for(ino.subvol)?;
890        let blksize = self.inner.blksize;
891        // Build or fetch the extent map so repeated reads of the same
892        // file don't re-walk the FS tree.
893        let extent_map = self.extent_map_for(ino, tree_root)?;
894        let mut reader = self.lock_reader();
895        read::read_file_with_map(
896            &mut reader,
897            &extent_map.records,
898            item.size,
899            offset,
900            size,
901            blksize,
902        )
903    }
904
905    fn send_blocking<W: io::Write>(
906        &self,
907        snapshot: SubvolId,
908        output: W,
909    ) -> io::Result<W> {
910        let info = self
911            .list_subvolumes_blocking()?
912            .into_iter()
913            .find(|s| s.id == snapshot)
914            .ok_or_else(|| {
915                io::Error::new(
916                    io::ErrorKind::NotFound,
917                    format!("subvolume {} not found", snapshot.0),
918                )
919            })?;
920
921        let mut writer = StreamWriter::new(output, SEND_STREAM_VERSION)?;
922
923        // The SUBVOL command names the receive-side directory. We use
924        // the subvolume's recorded name; for the default `FS_TREE`
925        // (no name) fall back to a synthetic identifier so receive
926        // has something to mkdir with.
927        let subvol_path = if info.name.is_empty() {
928            format!("subvol-{}", info.id.0)
929        } else {
930            String::from_utf8_lossy(&info.name).into_owned()
931        };
932        writer.write_command(&StreamCommand::Subvol {
933            path: subvol_path,
934            uuid: info.uuid,
935            ctransid: info.ctransid,
936        })?;
937
938        // Walk the subvolume tree starting at the root directory.
939        // `seen` tracks inodes we've already created so a second
940        // `INODE_REF` for the same inode emits `Link` instead of a
941        // duplicate creation. Stores the first emitted path so the
942        // `Link` target is reachable.
943        let root = Inode {
944            subvol: snapshot,
945            ino: ROOT_DIR_OBJECTID,
946        };
947        let mut seen: BTreeMap<u64, String> = BTreeMap::new();
948        seen.insert(ROOT_DIR_OBJECTID, String::new());
949        self.send_dir_recursive(&mut writer, root, "", &mut seen)?;
950
951        writer.write_command(&StreamCommand::End)?;
952        writer.finish()
953    }
954
955    fn send_dir_recursive<W: io::Write>(
956        &self,
957        writer: &mut StreamWriter<W>,
958        dir: Inode,
959        dir_path: &str,
960        seen: &mut BTreeMap<u64, String>,
961    ) -> io::Result<()> {
962        // Skip `.` and `..` (offsets 0 and 1) — those are synthetic.
963        let entries = self.readdir_blocking(dir, 1)?;
964        let mut subdirs: Vec<(Inode, String)> = Vec::new();
965        for entry in entries {
966            // Skip the synthetic `.` / `..` slots.
967            if entry.name == b"." || entry.name == b".." {
968                continue;
969            }
970            // Subvolume crossings are out of scope for tier 1: the
971            // child subvolume is a separate tree and would need its
972            // own SUBVOL/SNAPSHOT command. Caller can re-invoke
973            // send() on each subvol they want.
974            if entry.ino.subvol != dir.subvol {
975                continue;
976            }
977            let entry_name = String::from_utf8_lossy(&entry.name).into_owned();
978            let entry_path = if dir_path.is_empty() {
979                entry_name
980            } else {
981                format!("{dir_path}/{entry_name}")
982            };
983
984            // Hardlink case: we've already emitted the inode under
985            // its first path. Just attach a Link and move on.
986            if let Some(first_path) = seen.get(&entry.ino.ino) {
987                writer.write_command(&StreamCommand::Link {
988                    path: entry_path.clone(),
989                    target: first_path.clone(),
990                })?;
991                continue;
992            }
993            let item =
994                self.read_inode_item_blocking(entry.ino)?.ok_or_else(|| {
995                    io::Error::new(
996                        io::ErrorKind::NotFound,
997                        format!("inode {} item missing", entry.ino.ino),
998                    )
999                })?;
1000            seen.insert(entry.ino.ino, entry_path.clone());
1001
1002            self.send_create_command(writer, &entry, &entry_path, &item)?;
1003            self.send_xattrs(writer, entry.ino, &entry_path)?;
1004            if entry.kind == FileKind::RegularFile {
1005                self.send_file_data(writer, entry.ino, &entry_path, item.size)?;
1006                writer.write_command(&StreamCommand::Truncate {
1007                    path: entry_path.clone(),
1008                    size: item.size,
1009                })?;
1010            }
1011            send_metadata(writer, &entry_path, &item)?;
1012
1013            // Defer recursion until after we've finished this
1014            // directory's own entries — keeps the per-dir command
1015            // ordering cleaner.
1016            if entry.kind == FileKind::Directory {
1017                subdirs.push((entry.ino, entry_path));
1018            }
1019        }
1020        for (subdir_ino, subdir_path) in subdirs {
1021            self.send_dir_recursive(writer, subdir_ino, &subdir_path, seen)?;
1022        }
1023        Ok(())
1024    }
1025
1026    fn send_create_command<W: io::Write>(
1027        &self,
1028        writer: &mut StreamWriter<W>,
1029        entry: &Entry,
1030        path: &str,
1031        item: &InodeItem,
1032    ) -> io::Result<()> {
1033        let cmd = match entry.kind {
1034            FileKind::RegularFile => {
1035                StreamCommand::Mkfile { path: path.into() }
1036            }
1037            FileKind::Directory => StreamCommand::Mkdir { path: path.into() },
1038            FileKind::Symlink => {
1039                let target =
1040                    self.readlink_blocking(entry.ino)?.ok_or_else(|| {
1041                        io::Error::new(
1042                            io::ErrorKind::NotFound,
1043                            format!("symlink {} target missing", entry.ino.ino),
1044                        )
1045                    })?;
1046                StreamCommand::Symlink {
1047                    path: path.into(),
1048                    target: String::from_utf8_lossy(&target).into_owned(),
1049                }
1050            }
1051            FileKind::NamedPipe => StreamCommand::Mkfifo { path: path.into() },
1052            FileKind::Socket => StreamCommand::Mksock { path: path.into() },
1053            FileKind::BlockDevice | FileKind::CharDevice => {
1054                StreamCommand::Mknod {
1055                    path: path.into(),
1056                    mode: u64::from(item.mode),
1057                    rdev: item.rdev,
1058                }
1059            }
1060        };
1061        writer.write_command(&cmd)
1062    }
1063
1064    fn send_xattrs<W: io::Write>(
1065        &self,
1066        writer: &mut StreamWriter<W>,
1067        ino: Inode,
1068        path: &str,
1069    ) -> io::Result<()> {
1070        for name in self.xattr_list_blocking(ino)? {
1071            let Some(data) = self.xattr_get_blocking(ino, &name)? else {
1072                continue;
1073            };
1074            writer.write_command(&StreamCommand::SetXattr {
1075                path: path.into(),
1076                name: String::from_utf8_lossy(&name).into_owned(),
1077                data,
1078            })?;
1079        }
1080        Ok(())
1081    }
1082
1083    /// Emit `Write` commands covering `[0, size)` of `ino` in
1084    /// chunks that fit comfortably in v1's u16 TLV length field
1085    /// (we cap at [`SEND_WRITE_CHUNK_BYTES`] to leave headroom for
1086    /// the path/offset attributes). `read_blocking` materialises
1087    /// holes and prealloc as zeros and decompresses any compressed
1088    /// extents, so `data` is always plain bytes.
1089    fn send_file_data<W: io::Write>(
1090        &self,
1091        writer: &mut StreamWriter<W>,
1092        ino: Inode,
1093        path: &str,
1094        size: u64,
1095    ) -> io::Result<()> {
1096        let mut offset = 0u64;
1097        while offset < size {
1098            let remaining = size - offset;
1099            #[allow(clippy::cast_possible_truncation)]
1100            let chunk = remaining.min(SEND_WRITE_CHUNK_BYTES as u64) as u32;
1101            let data = self.read_blocking(ino, offset, chunk)?;
1102            if data.is_empty() {
1103                break;
1104            }
1105            writer.write_command(&StreamCommand::Write {
1106                path: path.into(),
1107                offset,
1108                data,
1109            })?;
1110            offset += u64::from(chunk);
1111        }
1112        Ok(())
1113    }
1114
1115    fn seek_hole_data_blocking(
1116        &self,
1117        ino: Inode,
1118        offset: u64,
1119        whence: SeekHoleData,
1120    ) -> io::Result<u64> {
1121        let item = self.read_inode_item_blocking(ino)?.ok_or_else(|| {
1122            io::Error::new(
1123                io::ErrorKind::NotFound,
1124                format!("inode {} not found", ino.ino),
1125            )
1126        })?;
1127        let file_size = item.size;
1128
1129        // POSIX: any whence at or past EOF is ENXIO. Linux returns
1130        // ENXIO for both SEEK_HOLE and SEEK_DATA in that case.
1131        if offset >= file_size {
1132            return Err(io::Error::from_raw_os_error(libc::ENXIO));
1133        }
1134
1135        let tree_root = self.tree_root_for(ino.subvol)?;
1136        let extent_map = self.extent_map_for(ino, tree_root)?;
1137        let want_hole = matches!(whence, SeekHoleData::Hole);
1138
1139        // Walk records in file_pos order, classifying each region as
1140        // data or hole. An implicit hole sits before any record whose
1141        // file_pos > cursor; a regular extent with disk_bytenr == 0
1142        // is an explicit hole; inline and prealloc count as data.
1143        let mut cursor = 0u64;
1144        for r in &extent_map.records {
1145            // Implicit hole [cursor, r.file_pos).
1146            if r.file_pos > cursor {
1147                let hole_end = r.file_pos.min(file_size);
1148                if hole_end > offset && want_hole {
1149                    return Ok(offset.max(cursor));
1150                }
1151                cursor = hole_end;
1152                if cursor >= file_size {
1153                    break;
1154                }
1155            }
1156            let body_len = match &r.item.body {
1157                FileExtentBody::Inline { .. } => r.item.ram_bytes,
1158                FileExtentBody::Regular { num_bytes, .. } => *num_bytes,
1159            };
1160            let r_start = r.file_pos.max(cursor);
1161            let r_end = (r.file_pos + body_len).min(file_size);
1162            if r_end <= r_start {
1163                continue;
1164            }
1165            let r_is_hole = matches!(
1166                &r.item.body,
1167                FileExtentBody::Regular { disk_bytenr: 0, .. },
1168            );
1169            if r_end > offset && r_is_hole == want_hole {
1170                return Ok(offset.max(r_start));
1171            }
1172            cursor = r_end;
1173            if cursor >= file_size {
1174                break;
1175            }
1176        }
1177
1178        // Past every record, the rest of the file (if any) is a
1179        // trailing implicit hole. SEEK_HOLE additionally treats EOF
1180        // itself as a virtual hole, so it always finds *something*
1181        // for any offset within the file.
1182        if want_hole {
1183            if cursor < file_size && cursor > offset {
1184                Ok(cursor)
1185            } else if offset < file_size {
1186                // No real hole found; report the virtual EOF hole.
1187                Ok(file_size)
1188            } else {
1189                Err(io::Error::from_raw_os_error(libc::ENXIO))
1190            }
1191        } else {
1192            Err(io::Error::from_raw_os_error(libc::ENXIO))
1193        }
1194    }
1195
1196    /// Build (or fetch from cache) the [`ExtentMap`] for `ino`.
1197    fn extent_map_for(
1198        &self,
1199        ino: Inode,
1200        tree_root: u64,
1201    ) -> io::Result<Arc<crate::cache::ExtentMap>> {
1202        if let Some(cached) = self.inner.extent_map_cache.get(ino) {
1203            return Ok(cached);
1204        }
1205        let mut reader = self.lock_reader();
1206        let records = read::collect_extents(&mut reader, tree_root, ino.ino)?;
1207        drop(reader);
1208        let map = Arc::new(crate::cache::ExtentMap { records });
1209        self.inner.extent_map_cache.put(ino, Arc::clone(&map));
1210        Ok(map)
1211    }
1212
1213    fn xattr_list_blocking(&self, ino: Inode) -> io::Result<Vec<Vec<u8>>> {
1214        let tree_root = self.tree_root_for(ino.subvol)?;
1215        let mut reader = self.lock_reader();
1216        xattr::list_xattrs(&mut reader, tree_root, ino.ino)
1217    }
1218
1219    fn xattr_get_blocking(
1220        &self,
1221        ino: Inode,
1222        name: &[u8],
1223    ) -> io::Result<Option<Vec<u8>>> {
1224        let tree_root = self.tree_root_for(ino.subvol)?;
1225        let mut reader = self.lock_reader();
1226        xattr::get_xattr(&mut reader, tree_root, ino.ino, name)
1227    }
1228
1229    fn tree_search_blocking(
1230        &self,
1231        filter: SearchFilter,
1232        max_buf_size: usize,
1233    ) -> io::Result<Vec<SearchItem>> {
1234        // The root tree itself is at superblock.root; subvolume trees
1235        // are looked up via tree_root_for. tree_id == 1 is the root
1236        // tree (BTRFS_ROOT_TREE_OBJECTID). Anything else has to be a
1237        // subvolume id we know about.
1238        // sizeof(btrfs_ioctl_search_header).
1239        const HEADER_SIZE: usize = 32;
1240
1241        let tree_root = if filter.tree_id == 1 {
1242            self.inner.superblock.root
1243        } else {
1244            self.tree_root_for(SubvolId(filter.tree_id))?
1245        };
1246        let min = (filter.min_objectid, filter.min_type, filter.min_offset);
1247        let max = (filter.max_objectid, filter.max_type, filter.max_offset);
1248
1249        let mut results: Vec<SearchItem> = Vec::new();
1250        let mut buf_used: usize = 0;
1251        let mut reader = self.lock_reader();
1252        tree_walk(&mut reader, tree_root, Traversal::Dfs, &mut |block| {
1253            if results.len() >= filter.max_items as usize {
1254                return;
1255            }
1256            let TreeBlock::Leaf {
1257                items,
1258                data,
1259                header,
1260            } = block
1261            else {
1262                return;
1263            };
1264            let leaf_transid = header.generation;
1265            if leaf_transid < filter.min_transid
1266                || leaf_transid > filter.max_transid
1267            {
1268                return;
1269            }
1270            let hdr_size = mem::size_of::<btrfs_disk::raw::btrfs_header>();
1271            for item in items {
1272                if results.len() >= filter.max_items as usize {
1273                    return;
1274                }
1275                let key = &item.key;
1276                let item_type = u32::from(key.key_type.to_raw());
1277                let compound = (key.objectid, item_type, key.offset);
1278                if compound < min || compound > max {
1279                    continue;
1280                }
1281                let start = hdr_size + item.offset as usize;
1282                let end = start + item.size as usize;
1283                if end > data.len() {
1284                    continue;
1285                }
1286                let payload = &data[start..end];
1287                let next_used = buf_used + HEADER_SIZE + payload.len();
1288                if next_used > max_buf_size {
1289                    // Stop entirely — no more items will fit.
1290                    return;
1291                }
1292                results.push(SearchItem {
1293                    transid: leaf_transid,
1294                    objectid: key.objectid,
1295                    item_type,
1296                    offset: key.offset,
1297                    data: payload.to_vec(),
1298                });
1299                buf_used = next_used;
1300            }
1301        })?;
1302        Ok(results)
1303    }
1304
1305    fn ino_lookup_blocking(
1306        &self,
1307        subvol: SubvolId,
1308        objectid: u64,
1309    ) -> io::Result<Option<Vec<u8>>> {
1310        // The subvolume root has no `INODE_REF`; an empty path is
1311        // the right answer for it.
1312        if objectid == ROOT_DIR_OBJECTID {
1313            return Ok(Some(Vec::new()));
1314        }
1315        let tree_root = self.tree_root_for(subvol)?;
1316        let mut reader = self.lock_reader();
1317        let mut components: Vec<Vec<u8>> = Vec::new();
1318        let mut current = objectid;
1319        // Bound the walk so a corrupted INODE_REF cycle can't hang
1320        // the FUSE worker forever. 4096 nesting levels is more than
1321        // any sane btrfs depth.
1322        for _ in 0..4096 {
1323            if current == ROOT_DIR_OBJECTID {
1324                components.reverse();
1325                return Ok(Some(join_path(&components)));
1326            }
1327            let mut next_parent: Option<u64> = None;
1328            let mut name: Option<Vec<u8>> = None;
1329            for_each_item(&mut reader, tree_root, |key, data| {
1330                if next_parent.is_some() {
1331                    return;
1332                }
1333                if key.objectid == current && key.key_type == KeyType::InodeRef
1334                {
1335                    // A single INODE_REF item may pack multiple
1336                    // entries (one per hardlink to this inode). The
1337                    // kernel `INO_LOOKUP` picks the first; do the
1338                    // same.
1339                    if let Some(iref) =
1340                        InodeRef::parse_all(data).into_iter().next()
1341                    {
1342                        next_parent = Some(key.offset);
1343                        name = Some(iref.name);
1344                    }
1345                }
1346            })?;
1347            match (next_parent, name) {
1348                (Some(p), Some(n)) => {
1349                    components.push(n);
1350                    current = p;
1351                }
1352                _ => return Ok(None),
1353            }
1354        }
1355        Err(io::Error::new(
1356            io::ErrorKind::InvalidData,
1357            format!("INODE_REF chain for objectid {objectid} too deep"),
1358        ))
1359    }
1360
1361    fn ino_paths_blocking(
1362        &self,
1363        subvol: SubvolId,
1364        objectid: u64,
1365    ) -> io::Result<Vec<Vec<u8>>> {
1366        // The subvolume root has no INODE_REF; the empty path names it.
1367        if objectid == ROOT_DIR_OBJECTID {
1368            return Ok(vec![Vec::new()]);
1369        }
1370        let tree_root = self.tree_root_for(subvol)?;
1371        // Collect all (parent_dirid, name) pairs in one tree walk.
1372        // INODE_REF packs every link to the same parent dir into one
1373        // item; INODE_EXTREF holds links whose name+parent pair didn't
1374        // fit (typically across many parents), with the parent stored
1375        // in the struct rather than the key offset.
1376        let mut refs: Vec<(u64, Vec<u8>)> = Vec::new();
1377        let mut reader = self.lock_reader();
1378        for_each_item(&mut reader, tree_root, |key, data| {
1379            if key.objectid != objectid {
1380                return;
1381            }
1382            match key.key_type {
1383                KeyType::InodeRef => {
1384                    for iref in InodeRef::parse_all(data) {
1385                        refs.push((key.offset, iref.name));
1386                    }
1387                }
1388                KeyType::InodeExtref => {
1389                    for eref in InodeExtref::parse_all(data) {
1390                        refs.push((eref.parent, eref.name));
1391                    }
1392                }
1393                _ => {}
1394            }
1395        })?;
1396        drop(reader);
1397
1398        // For each (parent, name) prepend the parent's path. We reuse
1399        // ino_lookup_blocking which re-acquires the reader lock, so it
1400        // matters that the lock is released above.
1401        let mut paths = Vec::with_capacity(refs.len());
1402        for (parent, name) in refs {
1403            let Some(parent_path) = self.ino_lookup_blocking(subvol, parent)?
1404            else {
1405                continue;
1406            };
1407            let mut p = parent_path;
1408            if !p.is_empty() {
1409                p.push(b'/');
1410            }
1411            p.extend_from_slice(&name);
1412            paths.push(p);
1413        }
1414        Ok(paths)
1415    }
1416
1417    fn list_subvolumes_blocking(&self) -> io::Result<Vec<SubvolInfo>> {
1418        let root_tree = self.inner.superblock.root;
1419        // Collect ROOT_ITEM (id → metadata) and ROOT_BACKREF (id →
1420        // (parent, name)) in a single root-tree walk. Each subvolume
1421        // has at most one BACKREF; we keep the first if any duplicate
1422        // appears.
1423        let mut roots: BTreeMap<u64, RootItem> = BTreeMap::new();
1424        let mut backrefs: BTreeMap<u64, (u64, RootRef)> = BTreeMap::new();
1425        let mut reader = self.lock_reader();
1426        for_each_item(&mut reader, root_tree, |key, data| {
1427            match key.key_type {
1428                KeyType::RootItem if is_subvolume_id(key.objectid) => {
1429                    if let Some(item) = RootItem::parse(data) {
1430                        roots.entry(key.objectid).or_insert(item);
1431                    }
1432                }
1433                KeyType::RootBackref => {
1434                    if let Some(rr) = RootRef::parse(data) {
1435                        backrefs
1436                            .entry(key.objectid)
1437                            .or_insert((key.offset, rr));
1438                    }
1439                }
1440                _ => {}
1441            }
1442        })?;
1443        drop(reader);
1444
1445        let mut out = Vec::with_capacity(roots.len());
1446        for (id, item) in roots {
1447            let (parent, name, dirid) = match backrefs.get(&id) {
1448                Some((parent_id, rr)) => {
1449                    (Some(SubvolId(*parent_id)), rr.name.clone(), rr.dirid)
1450                }
1451                None => (None, Vec::new(), 0),
1452            };
1453            out.push(SubvolInfo {
1454                id: SubvolId(id),
1455                parent,
1456                name,
1457                dirid,
1458                readonly: item.flags.contains(RootItemFlags::RDONLY),
1459                ctime: to_system_time(&item.ctime),
1460                otime: to_system_time(&item.otime),
1461                generation: item.generation,
1462                ctransid: item.ctransid,
1463                otransid: item.otransid,
1464                uuid: item.uuid,
1465                parent_uuid: item.parent_uuid,
1466                received_uuid: item.received_uuid,
1467            });
1468        }
1469        Ok(out)
1470    }
1471
1472    /// Acquire the I/O lock. Forwards a poisoned mutex without
1473    /// unwrapping at every call site.
1474    fn lock_reader(&self) -> MutexGuard<'_, BlockReader<R>> {
1475        self.inner.reader.lock().unwrap()
1476    }
1477
1478    /// Map a [`SubvolId`] to its tree root logical address.
1479    fn tree_root_for(&self, subvol: SubvolId) -> io::Result<u64> {
1480        if !is_subvolume_id(subvol.0) {
1481            return Err(io::Error::new(
1482                io::ErrorKind::InvalidInput,
1483                format!("{} is not a valid subvolume id", subvol.0),
1484            ));
1485        }
1486        self.inner
1487            .tree_roots
1488            .get(&subvol.0)
1489            .map(|(logical, _)| *logical)
1490            .ok_or_else(|| {
1491                io::Error::new(
1492                    io::ErrorKind::NotFound,
1493                    format!("subvolume {} not found", subvol.0),
1494                )
1495            })
1496    }
1497}
1498
1499/// Run a sync closure on the tokio blocking pool, mapping a
1500/// `JoinError` to an `io::Error` so callers see a single error type.
1501async fn spawn_blocking<F, T>(f: F) -> io::Result<T>
1502where
1503    F: FnOnce() -> io::Result<T> + Send + 'static,
1504    T: Send + 'static,
1505{
1506    tokio::task::spawn_blocking(f)
1507        .await
1508        .map_err(|e| io::Error::other(format!("blocking task failed: {e}")))?
1509}
1510
1511/// DFS the given tree, calling `visitor(item_key, item_data)` for every
1512/// leaf item.
1513fn for_each_item<R, F>(
1514    reader: &mut BlockReader<R>,
1515    tree_root: u64,
1516    mut visitor: F,
1517) -> io::Result<()>
1518where
1519    R: io::Read + io::Seek,
1520    F: FnMut(&btrfs_disk::tree::DiskKey, &[u8]),
1521{
1522    tree_walk(reader, tree_root, Traversal::Dfs, &mut |block| {
1523        if let TreeBlock::Leaf { items, data, .. } = block {
1524            let header_size = mem::size_of::<btrfs_disk::raw::btrfs_header>();
1525            for item in items {
1526                let start = header_size + item.offset as usize;
1527                let end = start + item.size as usize;
1528                if end <= data.len() {
1529                    visitor(&item.key, &data[start..end]);
1530                }
1531            }
1532        }
1533    })
1534}
1535
1536fn read_inode<R: io::Read + io::Seek>(
1537    reader: &mut BlockReader<R>,
1538    tree_root: u64,
1539    objectid: u64,
1540) -> io::Result<Option<InodeItem>> {
1541    let mut found = None;
1542    for_each_item(reader, tree_root, |key, data| {
1543        if found.is_some() {
1544            return;
1545        }
1546        if key.objectid == objectid && key.key_type == KeyType::InodeItem {
1547            found = InodeItem::parse(data);
1548        }
1549    })?;
1550    Ok(found)
1551}
1552
1553fn lookup_in_dir<R: io::Read + io::Seek>(
1554    reader: &mut BlockReader<R>,
1555    tree_root: u64,
1556    parent_objectid: u64,
1557    name: &[u8],
1558) -> io::Result<Option<DirItem>> {
1559    let mut found = None;
1560    for_each_item(reader, tree_root, |key, data| {
1561        if found.is_some() {
1562            return;
1563        }
1564        if key.objectid != parent_objectid || key.key_type != KeyType::DirItem {
1565            return;
1566        }
1567        for item in DirItem::parse_all(data) {
1568            if item.name == name {
1569                found = Some(item);
1570                return;
1571            }
1572        }
1573    })?;
1574    Ok(found)
1575}
1576
1577/// Build the full slash-separated path (no leading slash) of
1578/// subvolume `id` by walking its parent chain. The default
1579/// `FS_TREE` has an empty path; user subvolumes accumulate name
1580/// components from each ancestor. Used by
1581/// [`Filesystem::resolve_subvol_path`].
1582fn subvol_full_path(
1583    by_id: &BTreeMap<SubvolId, &SubvolInfo>,
1584    id: SubvolId,
1585) -> Vec<u8> {
1586    let mut components: Vec<Vec<u8>> = Vec::new();
1587    let mut current = id;
1588    while let Some(info) = by_id.get(&current) {
1589        if !info.name.is_empty() {
1590            components.push(info.name.clone());
1591        }
1592        match info.parent {
1593            Some(parent) => current = parent,
1594            None => break,
1595        }
1596    }
1597    components.reverse();
1598    let mut out: Vec<u8> = Vec::new();
1599    for (i, c) in components.iter().enumerate() {
1600        if i > 0 {
1601            out.push(b'/');
1602        }
1603        out.extend_from_slice(c);
1604    }
1605    out
1606}
1607
1608/// Convert a btrfs on-disk [`DiskTimespec`] into the
1609/// [`StreamTimespec`] shape carried by send-stream `Utimes`
1610/// commands. Both types are `(sec: u64, nsec: u32)` — separate
1611/// types for type-system hygiene rather than wire-level
1612/// difference.
1613fn to_stream_timespec(t: &DiskTimespec) -> StreamTimespec {
1614    StreamTimespec {
1615        sec: t.sec,
1616        nsec: t.nsec,
1617    }
1618}
1619
1620/// Emit the trailing `Chown`/`Chmod`/`Utimes` triple every inode
1621/// gets after creation. Free function rather than a method since
1622/// it doesn't touch the [`Filesystem`] state — only forwards
1623/// fields from the inode item we've already loaded.
1624fn send_metadata<W: io::Write>(
1625    writer: &mut StreamWriter<W>,
1626    path: &str,
1627    item: &InodeItem,
1628) -> io::Result<()> {
1629    writer.write_command(&StreamCommand::Chown {
1630        path: path.into(),
1631        uid: u64::from(item.uid),
1632        gid: u64::from(item.gid),
1633    })?;
1634    writer.write_command(&StreamCommand::Chmod {
1635        path: path.into(),
1636        // Strip the file-type bits; receive applies these as
1637        // permission/setuid/setgid/sticky only.
1638        mode: u64::from(item.mode & 0o7777),
1639    })?;
1640    writer.write_command(&StreamCommand::Utimes {
1641        path: path.into(),
1642        atime: to_stream_timespec(&item.atime),
1643        mtime: to_stream_timespec(&item.mtime),
1644        ctime: to_stream_timespec(&item.ctime),
1645    })?;
1646    Ok(())
1647}
1648
1649/// Join path components with `/` separators (no leading or trailing
1650/// slash). Used by [`Filesystem::ino_lookup_blocking`] to assemble
1651/// the result of an `INODE_REF` walk.
1652fn join_path(components: &[Vec<u8>]) -> Vec<u8> {
1653    let total = components.iter().map(Vec::len).sum::<usize>()
1654        + components.len().saturating_sub(1);
1655    let mut out: Vec<u8> = Vec::with_capacity(total);
1656    for (i, c) in components.iter().enumerate() {
1657        if i > 0 {
1658            out.push(b'/');
1659        }
1660        out.extend_from_slice(c);
1661    }
1662    out
1663}
1664
1665/// Find the parent objectid of `oid` via `INODE_REF`. The `INODE_REF`
1666/// key offset field contains the parent objectid directly. Returns
1667/// `oid` itself if no ref is found (root directory).
1668fn find_parent_oid<R: io::Read + io::Seek>(
1669    reader: &mut BlockReader<R>,
1670    tree_root: u64,
1671    oid: u64,
1672) -> io::Result<u64> {
1673    let mut parent = oid;
1674    for_each_item(reader, tree_root, |key, _data| {
1675        if parent != oid {
1676            return;
1677        }
1678        if key.objectid == oid && key.key_type == KeyType::InodeRef {
1679            parent = key.offset;
1680        }
1681    })?;
1682    Ok(parent)
1683}
1684
1685/// Resolve the parent of a subvolume root via `ROOT_BACKREF` in the
1686/// root tree. Returns `Some(parent_inode)` where `parent_inode` is
1687/// the directory in the parent subvolume that contains this one,
1688/// or `None` for top-level subvolumes (no `ROOT_BACKREF`, e.g. the
1689/// default `FS_TREE`).
1690fn find_root_backref_parent<R: io::Read + io::Seek>(
1691    reader: &mut BlockReader<R>,
1692    root_tree_logical: u64,
1693    child_subvol: u64,
1694) -> io::Result<Option<Inode>> {
1695    let mut found = None;
1696    for_each_item(reader, root_tree_logical, |key, data| {
1697        if found.is_some() {
1698            return;
1699        }
1700        if key.objectid == child_subvol && key.key_type == KeyType::RootBackref
1701        {
1702            if let Some(rr) = RootRef::parse(data) {
1703                found = Some(Inode {
1704                    subvol: SubvolId(key.offset),
1705                    ino: rr.dirid,
1706                });
1707            }
1708        }
1709    })?;
1710    Ok(found)
1711}