Skip to main content

btrfs_uapi/
subvolume.rs

1//! # Subvolume and snapshot management: creating, deleting, and querying subvolumes
2//!
3//! Subvolumes are independently snapshotable subtrees within a btrfs filesystem.
4//! Snapshots are subvolumes created as copy-on-write clones of an existing
5//! subvolume.  This module covers the full lifecycle: creating and deleting
6//! subvolumes and snapshots, reading subvolume metadata and flags, listing all
7//! subvolumes in a filesystem, and getting or setting the default subvolume
8//! that is mounted when no subvolume is explicitly requested.
9
10use crate::{
11    field_size,
12    raw::{
13        BTRFS_DIR_ITEM_KEY, BTRFS_FIRST_FREE_OBJECTID, BTRFS_FS_TREE_OBJECTID,
14        BTRFS_LAST_FREE_OBJECTID, BTRFS_ROOT_BACKREF_KEY, BTRFS_ROOT_ITEM_KEY,
15        BTRFS_ROOT_TREE_DIR_OBJECTID, BTRFS_ROOT_TREE_OBJECTID,
16        BTRFS_SUBVOL_QGROUP_INHERIT, BTRFS_SUBVOL_RDONLY,
17        BTRFS_SUBVOL_SPEC_BY_ID, BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE,
18        BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED, btrfs_ioc_default_subvol,
19        btrfs_ioc_get_subvol_info, btrfs_ioc_ino_lookup,
20        btrfs_ioc_snap_create_v2, btrfs_ioc_snap_destroy_v2,
21        btrfs_ioc_subvol_create_v2, btrfs_ioc_subvol_getflags,
22        btrfs_ioc_subvol_setflags, btrfs_ioc_subvol_sync_wait,
23        btrfs_ioctl_get_subvol_info_args, btrfs_ioctl_ino_lookup_args,
24        btrfs_ioctl_subvol_wait, btrfs_ioctl_vol_args_v2, btrfs_qgroup_inherit,
25        btrfs_root_item, btrfs_timespec,
26    },
27    tree_search::{SearchKey, tree_search},
28    util::{read_le_u32, read_le_u64},
29};
30use bitflags::bitflags;
31use nix::libc::c_char;
32use std::{
33    collections::HashMap,
34    ffi::CStr,
35    mem,
36    os::{fd::AsRawFd, unix::io::BorrowedFd},
37    time::{Duration, SystemTime, UNIX_EPOCH},
38};
39use uuid::Uuid;
40
41/// The top-level subvolume (FS tree); objectid 5, always present.
42///
43/// Returned by [`subvolume_default_get`] when no explicit default has been set.
44pub const FS_TREE_OBJECTID: u64 = BTRFS_FS_TREE_OBJECTID as u64;
45
46bitflags! {
47    /// Flags on a btrfs subvolume (the `flags` field of the root item /
48    /// `BTRFS_IOC_SUBVOL_{GET,SET}FLAGS`).
49    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
50    pub struct SubvolumeFlags: u64 {
51        /// Subvolume is read-only.
52        const RDONLY = BTRFS_SUBVOL_RDONLY as u64;
53    }
54}
55
56impl std::fmt::Display for SubvolumeFlags {
57    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
58        if self.contains(SubvolumeFlags::RDONLY) {
59            write!(f, "readonly")
60        } else {
61            write!(f, "-")
62        }
63    }
64}
65
66/// Subvolume metadata returned by `BTRFS_IOC_GET_SUBVOL_INFO`.
67#[derive(Debug, Clone)]
68pub struct SubvolumeInfo {
69    /// Root ID (subvolume ID) of this subvolume.
70    pub id: u64,
71    /// Name of this subvolume within its parent directory.
72    pub name: String,
73    /// Root ID of the parent subvolume.
74    pub parent_id: u64,
75    /// Inode number of the directory within the parent that holds this subvolume.
76    pub dir_id: u64,
77    /// Current generation of the subvolume.
78    pub generation: u64,
79    /// Subvolume flags (e.g. read-only).
80    pub flags: SubvolumeFlags,
81    /// UUID of this subvolume.
82    pub uuid: Uuid,
83    /// UUID of the parent subvolume (non-nil for snapshots).
84    pub parent_uuid: Uuid,
85    /// UUID of the received subvolume (non-nil for received snapshots).
86    pub received_uuid: Uuid,
87    /// Transaction ID when the subvolume was last changed.
88    pub ctransid: u64,
89    /// Transaction ID when the subvolume was created.
90    pub otransid: u64,
91    /// Transaction ID when a send was last performed.
92    pub stransid: u64,
93    /// Transaction ID when this subvolume was last received.
94    pub rtransid: u64,
95    /// Time of last change.
96    pub ctime: SystemTime,
97    /// Creation time.
98    pub otime: SystemTime,
99    /// Time of last send.
100    pub stime: SystemTime,
101    /// Time of last receive.
102    pub rtime: SystemTime,
103}
104
105/// A single subvolume entry returned by [`subvolume_list`].
106#[derive(Debug, Clone)]
107pub struct SubvolumeListItem {
108    /// Root ID (subvolume ID).
109    pub root_id: u64,
110    /// Root ID of the parent subvolume (`0` if unknown / not found).
111    pub parent_id: u64,
112    /// Inode of the directory in the parent that contains this subvolume.
113    pub dir_id: u64,
114    /// Current generation.
115    pub generation: u64,
116    /// Subvolume flags.
117    pub flags: SubvolumeFlags,
118    /// UUID of this subvolume.
119    pub uuid: Uuid,
120    /// UUID of the parent subvolume.
121    pub parent_uuid: Uuid,
122    /// UUID of the received subvolume.
123    pub received_uuid: Uuid,
124    /// Transaction ID when created.
125    pub otransid: u64,
126    /// Creation time.
127    pub otime: SystemTime,
128    /// Subvolume name (the leaf name within its parent).
129    ///
130    /// Full path resolution relative to the filesystem root requires
131    /// `BTRFS_IOC_INO_LOOKUP` and is not yet implemented; this field contains
132    /// only the leaf name as stored in the root backref item.
133    pub name: String,
134}
135
136/// Write `name` into the `name` union member of a zeroed
137/// `btrfs_ioctl_vol_args_v2`, returning `ENAMETOOLONG` if it does not fit.
138fn set_v2_name(
139    args: &mut btrfs_ioctl_vol_args_v2,
140    name: &CStr,
141) -> nix::Result<()> {
142    let bytes = name.to_bytes(); // excludes nul terminator
143    // SAFETY: name is the active union member; the struct is already zeroed so
144    // the implicit nul terminator is already present.
145    let name_buf: &mut [c_char] = unsafe { &mut args.__bindgen_anon_2.name };
146    if bytes.len() >= name_buf.len() {
147        return Err(nix::errno::Errno::ENAMETOOLONG);
148    }
149    for (i, &b) in bytes.iter().enumerate() {
150        name_buf[i] = b as c_char;
151    }
152    Ok(())
153}
154
155/// Build a `btrfs_qgroup_inherit` buffer for the given qgroup IDs.
156///
157/// The returned `Vec<u64>` is sized to hold the base struct plus the trailing
158/// `qgroups[]` array, with 8-byte alignment guaranteed by the `u64` element
159/// type.
160fn build_qgroup_inherit(qgroups: &[u64]) -> Vec<u64> {
161    let base_size = mem::size_of::<btrfs_qgroup_inherit>();
162    let total_size = base_size + std::mem::size_of_val(qgroups);
163    let num_u64 = total_size.div_ceil(8);
164    let mut buf = vec![0u64; num_u64];
165
166    // SAFETY: buf is large enough and zeroed; we write through a properly
167    // aligned pointer (btrfs_qgroup_inherit has 8-byte alignment).
168    let inherit =
169        unsafe { &mut *(buf.as_mut_ptr() as *mut btrfs_qgroup_inherit) };
170    inherit.num_qgroups = qgroups.len() as u64;
171
172    // Write the qgroup IDs into the flexible array member.
173    if !qgroups.is_empty() {
174        let array = unsafe { inherit.qgroups.as_mut_slice(qgroups.len()) };
175        array.copy_from_slice(qgroups);
176    }
177
178    buf
179}
180
181/// Set the `BTRFS_SUBVOL_QGROUP_INHERIT` fields on a `vol_args_v2` struct.
182///
183/// `buf` must be the buffer returned by `build_qgroup_inherit`.
184fn set_qgroup_inherit(
185    args: &mut btrfs_ioctl_vol_args_v2,
186    buf: &[u64],
187    num_qgroups: usize,
188) {
189    args.flags |= BTRFS_SUBVOL_QGROUP_INHERIT as u64;
190    let base_size = mem::size_of::<btrfs_qgroup_inherit>();
191    let total_size = base_size + num_qgroups * mem::size_of::<u64>();
192    args.__bindgen_anon_1.__bindgen_anon_1.size = total_size as u64;
193    args.__bindgen_anon_1.__bindgen_anon_1.qgroup_inherit =
194        buf.as_ptr() as *mut btrfs_qgroup_inherit;
195}
196
197/// Create a new subvolume named `name` inside the directory referred to by
198/// `parent_fd`.
199///
200/// `name` must be a plain leaf name (no slashes).  The caller is responsible
201/// for opening the correct parent directory.  If `qgroups` is non-empty, the
202/// new subvolume is added to those qgroups.  Requires `CAP_SYS_ADMIN`.
203///
204/// Errors: ENAMETOOLONG if `name` does not fit in the 4040-byte kernel
205/// buffer.  EEXIST if a subvolume or directory with that name already exists.
206/// EPERM without `CAP_SYS_ADMIN`.
207pub fn subvolume_create(
208    parent_fd: BorrowedFd,
209    name: &CStr,
210    qgroups: &[u64],
211) -> nix::Result<()> {
212    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
213    set_v2_name(&mut args, name)?;
214
215    let inherit_buf;
216    if !qgroups.is_empty() {
217        inherit_buf = build_qgroup_inherit(qgroups);
218        set_qgroup_inherit(&mut args, &inherit_buf, qgroups.len());
219    }
220
221    unsafe { btrfs_ioc_subvol_create_v2(parent_fd.as_raw_fd(), &args) }?;
222    Ok(())
223}
224
225/// Delete the subvolume or snapshot named `name` from the directory referred
226/// to by `parent_fd`.
227///
228/// `name` must be a plain leaf name (no slashes).  Requires `CAP_SYS_ADMIN`.
229///
230/// Deletion is asynchronous: the ioctl removes the directory entry
231/// immediately, but the kernel cleaner thread reclaims the on-disk data
232/// in the background. Until the next transaction commit the deletion is
233/// not visible to other operations (e.g. `subvolume_list` still shows
234/// the subvolume). Call `sync` to force a commit, or pass
235/// `-c`/`--commit-after` at the CLI level. To wait for the cleaner to
236/// finish, use [`subvol_sync_wait_one`].
237pub fn subvolume_delete(parent_fd: BorrowedFd, name: &CStr) -> nix::Result<()> {
238    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
239    set_v2_name(&mut args, name)?;
240    unsafe { btrfs_ioc_snap_destroy_v2(parent_fd.as_raw_fd(), &args) }?;
241    Ok(())
242}
243
244/// Delete a subvolume by its numeric subvolume ID.
245///
246/// `fd` must be an open file descriptor on the filesystem (typically the mount
247/// point).  Unlike `subvolume_delete`, this does not require knowing the
248/// subvolume's path.  Requires `CAP_SYS_ADMIN`.
249///
250/// See [`subvolume_delete`] for details on commit visibility and async
251/// cleanup.
252pub fn subvolume_delete_by_id(
253    fd: BorrowedFd,
254    subvolid: u64,
255) -> nix::Result<()> {
256    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
257    args.flags = BTRFS_SUBVOL_SPEC_BY_ID as u64;
258    args.__bindgen_anon_2.subvolid = subvolid;
259    unsafe { btrfs_ioc_snap_destroy_v2(fd.as_raw_fd(), &args) }?;
260    Ok(())
261}
262
263/// Create a snapshot of the subvolume referred to by `source_fd`, placing it
264/// as `name` inside the directory referred to by `parent_fd`.
265///
266/// If `readonly` is `true` the new snapshot is created read-only.  If
267/// `qgroups` is non-empty, the new snapshot is added to those qgroups.
268/// Requires `CAP_SYS_ADMIN`.
269///
270/// Errors: ENAMETOOLONG if `name` does not fit in the 4040-byte kernel
271/// buffer.  EEXIST if a subvolume or directory with that name already exists.
272/// EROFS if `parent_fd` refers to a read-only subvolume.  EPERM without
273/// `CAP_SYS_ADMIN`.
274pub fn snapshot_create(
275    parent_fd: BorrowedFd,
276    source_fd: BorrowedFd,
277    name: &CStr,
278    readonly: bool,
279    qgroups: &[u64],
280) -> nix::Result<()> {
281    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
282    // The `fd` field carries the source subvolume file descriptor.
283    args.fd = source_fd.as_raw_fd() as i64;
284    if readonly {
285        args.flags = BTRFS_SUBVOL_RDONLY as u64;
286    }
287    set_v2_name(&mut args, name)?;
288
289    let inherit_buf;
290    if !qgroups.is_empty() {
291        inherit_buf = build_qgroup_inherit(qgroups);
292        set_qgroup_inherit(&mut args, &inherit_buf, qgroups.len());
293    }
294
295    unsafe { btrfs_ioc_snap_create_v2(parent_fd.as_raw_fd(), &args) }?;
296    Ok(())
297}
298
299/// Query detailed information about the subvolume that `fd` belongs to.
300///
301/// `fd` can be any file or directory within the target subvolume.
302/// Does not require elevated privileges.
303pub fn subvolume_info(fd: BorrowedFd) -> nix::Result<SubvolumeInfo> {
304    subvolume_info_by_id(fd, 0)
305}
306
307/// Query detailed information about a subvolume by its numeric root ID.
308///
309/// `fd` can be any open file descriptor on the filesystem.  If `rootid` is 0,
310/// the subvolume that `fd` belongs to is queried (equivalent to
311/// `subvolume_info`).  Does not require elevated privileges.
312///
313/// Errors: ENOENT if no subvolume with that `rootid` exists (or has been
314/// deleted but not yet cleaned).
315pub fn subvolume_info_by_id(
316    fd: BorrowedFd,
317    rootid: u64,
318) -> nix::Result<SubvolumeInfo> {
319    let mut raw: btrfs_ioctl_get_subvol_info_args = unsafe { mem::zeroed() };
320    raw.treeid = rootid;
321    unsafe { btrfs_ioc_get_subvol_info(fd.as_raw_fd(), &mut raw) }?;
322
323    let name = unsafe { CStr::from_ptr(raw.name.as_ptr()) }
324        .to_string_lossy()
325        .into_owned();
326
327    Ok(SubvolumeInfo {
328        id: raw.treeid,
329        name,
330        parent_id: raw.parent_id,
331        dir_id: raw.dirid,
332        generation: raw.generation,
333        flags: SubvolumeFlags::from_bits_truncate(raw.flags),
334        uuid: Uuid::from_bytes(raw.uuid),
335        parent_uuid: Uuid::from_bytes(raw.parent_uuid),
336        received_uuid: Uuid::from_bytes(raw.received_uuid),
337        ctransid: raw.ctransid,
338        otransid: raw.otransid,
339        stransid: raw.stransid,
340        rtransid: raw.rtransid,
341        ctime: ioctl_timespec_to_system_time(raw.ctime.sec, raw.ctime.nsec),
342        otime: ioctl_timespec_to_system_time(raw.otime.sec, raw.otime.nsec),
343        stime: ioctl_timespec_to_system_time(raw.stime.sec, raw.stime.nsec),
344        rtime: ioctl_timespec_to_system_time(raw.rtime.sec, raw.rtime.nsec),
345    })
346}
347
348/// Read the flags of the subvolume that `fd` belongs to.
349pub fn subvolume_flags_get(fd: BorrowedFd) -> nix::Result<SubvolumeFlags> {
350    let mut flags: u64 = 0;
351    unsafe { btrfs_ioc_subvol_getflags(fd.as_raw_fd(), &mut flags) }?;
352    Ok(SubvolumeFlags::from_bits_truncate(flags))
353}
354
355/// Set the flags of the subvolume that `fd` belongs to.
356///
357/// Requires `CAP_SYS_ADMIN` to make a subvolume read-only; any user may
358/// clear the read-only flag from a subvolume they own.
359pub fn subvolume_flags_set(
360    fd: BorrowedFd,
361    flags: SubvolumeFlags,
362) -> nix::Result<()> {
363    let raw: u64 = flags.bits();
364    unsafe { btrfs_ioc_subvol_setflags(fd.as_raw_fd(), &raw) }?;
365    Ok(())
366}
367
368/// Query the ID of the default subvolume of the filesystem referred to by
369/// `fd`.
370///
371/// Searches the root tree for the `BTRFS_DIR_ITEM_KEY` entry at objectid
372/// `BTRFS_ROOT_TREE_DIR_OBJECTID` that stores the default subvolume ID.
373/// Returns [`FS_TREE_OBJECTID`] if no default has been set.
374///
375/// Requires `CAP_SYS_ADMIN`.
376pub fn subvolume_default_get(fd: BorrowedFd) -> nix::Result<u64> {
377    let mut default_id: Option<u64> = None;
378
379    tree_search(
380        fd,
381        SearchKey::for_objectid_range(
382            BTRFS_ROOT_TREE_OBJECTID as u64,
383            BTRFS_DIR_ITEM_KEY,
384            BTRFS_ROOT_TREE_DIR_OBJECTID as u64,
385            BTRFS_ROOT_TREE_DIR_OBJECTID as u64,
386        ),
387        |_hdr, data| {
388            use crate::raw::btrfs_dir_item;
389            use std::mem::{offset_of, size_of};
390
391            let header_size = size_of::<btrfs_dir_item>();
392            if data.len() < header_size {
393                return Ok(());
394            }
395            let name_off = offset_of!(btrfs_dir_item, name_len);
396            let name_len =
397                u16::from_le_bytes([data[name_off], data[name_off + 1]])
398                    as usize;
399            if data.len() < header_size + name_len {
400                return Ok(());
401            }
402            let item_name = &data[header_size..header_size + name_len];
403            if item_name == b"default" {
404                let loc_off = offset_of!(btrfs_dir_item, location);
405                let target_id = u64::from_le_bytes(
406                    data[loc_off..loc_off + 8].try_into().unwrap(),
407                );
408                default_id = Some(target_id);
409            }
410            Ok(())
411        },
412    )?;
413
414    Ok(default_id.unwrap_or(BTRFS_FS_TREE_OBJECTID as u64))
415}
416
417/// Set the default subvolume of the filesystem referred to by `fd` to
418/// `subvolid`.
419///
420/// Pass [`FS_TREE_OBJECTID`] to restore the default.  Requires `CAP_SYS_ADMIN`.
421pub fn subvolume_default_set(fd: BorrowedFd, subvolid: u64) -> nix::Result<()> {
422    unsafe { btrfs_ioc_default_subvol(fd.as_raw_fd(), &subvolid) }?;
423    Ok(())
424}
425
426/// List all user subvolumes and snapshots in the filesystem referred to by
427/// `fd` by walking the root tree.
428///
429/// Two tree-search passes are made:
430/// 1. `ROOT_ITEM_KEY` — reads each subvolume's metadata (generation, flags,
431///    UUIDs, creation time).
432/// 2. `BTRFS_ROOT_BACKREF_KEY` — reads each subvolume's parent ID and leaf name.
433///
434/// Subvolumes for which no backref is found are still included; their
435/// `parent_id`, `dir_id`, and `name` will be zeroed / empty.
436///
437/// Requires `CAP_SYS_ADMIN` for the tree search. Without it the kernel
438/// returns EPERM; the caller should degrade gracefully (e.g. show only the
439/// leaf name without full path resolution).
440pub fn subvolume_list(fd: BorrowedFd) -> nix::Result<Vec<SubvolumeListItem>> {
441    let mut items: Vec<SubvolumeListItem> = Vec::new();
442
443    tree_search(
444        fd,
445        SearchKey::for_objectid_range(
446            BTRFS_ROOT_TREE_OBJECTID as u64,
447            BTRFS_ROOT_ITEM_KEY,
448            BTRFS_FIRST_FREE_OBJECTID as u64,
449            BTRFS_LAST_FREE_OBJECTID as u64,
450        ),
451        |hdr, data| {
452            if let Some(item) = parse_root_item(hdr.objectid, data) {
453                items.push(item);
454            }
455            Ok(())
456        },
457    )?;
458
459    tree_search(
460        fd,
461        SearchKey::for_objectid_range(
462            BTRFS_ROOT_TREE_OBJECTID as u64,
463            BTRFS_ROOT_BACKREF_KEY,
464            BTRFS_FIRST_FREE_OBJECTID as u64,
465            BTRFS_LAST_FREE_OBJECTID as u64,
466        ),
467        |hdr, data| {
468            // ROOT_BACKREF_KEY: objectid = subvol root_id, offset = parent root_id
469            let root_id = hdr.objectid;
470            let parent_id = hdr.offset;
471
472            if let Some(item) = items.iter_mut().find(|i| i.root_id == root_id)
473            {
474                // Only take the first ROOT_BACKREF for each subvolume.  A
475                // subvolume can have multiple ROOT_BACKREF entries when it is
476                // referenced from more than one parent (e.g. the subvolume
477                // also appears as a snapshot inside another subvolume).
478                // Items are returned in offset-ascending order, so the first
479                // entry has the smallest parent_id — which is the canonical
480                // location btrfs-progs uses for "top level" and path.
481                if item.parent_id == 0 {
482                    item.parent_id = parent_id;
483                    if let Some((dir_id, name)) = parse_root_ref(data) {
484                        item.dir_id = dir_id;
485                        item.name = name;
486                    }
487                }
488            }
489            Ok(())
490        },
491    )?;
492
493    // Determine which subvolume the fd is open on.  Paths are expressed
494    // relative to this subvolume, matching btrfs-progs behaviour.
495    let top_id =
496        crate::inode::lookup_path_rootid(fd).unwrap_or(FS_TREE_OBJECTID);
497
498    resolve_full_paths(fd, &mut items, top_id)?;
499
500    Ok(items)
501}
502
503/// Call `BTRFS_IOC_INO_LOOKUP` for `dir_id` within `parent_tree` and return
504/// the path from that tree's root to the directory.
505///
506/// The kernel returns the path with a trailing `/` when the directory is not
507/// the tree root, and an empty string when `dir_id` is the tree root itself.
508/// This prefix can be concatenated directly with the subvolume's leaf name to
509/// form the full segment within the parent.
510fn ino_lookup_dir_path(
511    fd: BorrowedFd,
512    parent_tree: u64,
513    dir_id: u64,
514) -> nix::Result<String> {
515    let mut args = btrfs_ioctl_ino_lookup_args {
516        treeid: parent_tree,
517        objectid: dir_id,
518        ..unsafe { mem::zeroed() }
519    };
520    // SAFETY: args is a valid, zeroed btrfs_ioctl_ino_lookup_args; the ioctl
521    // fills in args.name with a null-terminated path string.
522    unsafe { btrfs_ioc_ino_lookup(fd.as_raw_fd(), &mut args) }?;
523
524    // args.name is [c_char; 4080]; find the null terminator and decode.
525    let name_ptr: *const c_char = args.name.as_ptr();
526    // SAFETY: the ioctl guarantees null termination within the 4080-byte buffer.
527    let cstr = unsafe { CStr::from_ptr(name_ptr) };
528    Ok(cstr.to_string_lossy().into_owned())
529}
530
531/// Resolve the `name` field of every item in `items` from a bare leaf name to
532/// the full path relative to the filesystem root.
533///
534/// For each subvolume we already have `parent_id`, `dir_id`, and the leaf name
535/// from the ROOT_BACKREF pass.  A single `BTRFS_IOC_INO_LOOKUP` call per item
536/// gives the path from the parent tree's root down to the directory that
537/// contains the subvolume (the "dir prefix").  Concatenating that prefix with
538/// the leaf name yields the subvolume's segment within its parent.  Walking up
539/// the parent chain (using the in-memory items map) and joining segments with
540/// `/` gives the final full path.
541fn resolve_full_paths(
542    fd: BorrowedFd,
543    items: &mut [SubvolumeListItem],
544    top_id: u64,
545) -> nix::Result<()> {
546    // Map root_id → index for O(1) parent lookups.
547    let id_to_idx: HashMap<u64, usize> = items
548        .iter()
549        .enumerate()
550        .map(|(i, item)| (item.root_id, i))
551        .collect();
552
553    // Compute the "segment" for each item: the path of this subvolume within
554    // its immediate parent (dir prefix from INO_LOOKUP + leaf name).
555    // If INO_LOOKUP is not available (e.g. missing CAP_SYS_ADMIN), fall back
556    // to the bare leaf name so the list still works.
557    let segments: Vec<String> = items
558        .iter()
559        .map(|item| {
560            if item.parent_id == 0 || item.name.is_empty() {
561                return item.name.clone();
562            }
563            match ino_lookup_dir_path(fd, item.parent_id, item.dir_id) {
564                Ok(prefix) => format!("{}{}", prefix, item.name),
565                Err(_) => item.name.clone(),
566            }
567        })
568        .collect();
569
570    // Walk each item's parent chain, joining segments, and cache results so
571    // every ancestor is resolved at most once.
572    let mut full_paths: HashMap<u64, String> = HashMap::new();
573    let root_ids: Vec<u64> = items.iter().map(|i| i.root_id).collect();
574    for root_id in root_ids {
575        build_full_path(
576            root_id,
577            top_id,
578            &id_to_idx,
579            &segments,
580            items,
581            &mut full_paths,
582        );
583    }
584
585    for item in items.iter_mut() {
586        if let Some(path) = full_paths.remove(&item.root_id) {
587            item.name = path;
588        }
589    }
590
591    Ok(())
592}
593
594/// Compute the full path for `root_id`, memoizing into `cache`.
595///
596/// Walks the ancestor chain iteratively to avoid stack overflow on deep
597/// subvolume trees.  Collects segments from the target up to the FS tree
598/// root, then joins them in reverse order.
599///
600/// Cycle detection is included: ROOT_BACKREF entries can form mutual parent
601/// relationships (e.g. a snapshot stored inside the subvolume it was taken
602/// from), which would otherwise loop forever.
603fn build_full_path(
604    root_id: u64,
605    top_id: u64,
606    id_to_idx: &HashMap<u64, usize>,
607    segments: &[String],
608    items: &[SubvolumeListItem],
609    cache: &mut HashMap<u64, String>,
610) -> String {
611    // Collect the chain of root_ids from `root_id` up to the top subvolume
612    // (the one the fd is open on) or the FS tree root, stopping early if we
613    // hit an already-cached ancestor or a cycle.
614    let mut chain: Vec<u64> = Vec::new();
615    let mut visited: HashMap<u64, usize> = HashMap::new();
616    let mut cur = root_id;
617    loop {
618        if cache.contains_key(&cur) {
619            break;
620        }
621        if visited.contains_key(&cur) {
622            // Cycle detected: truncate the chain back to where the cycle
623            // starts so we don't join nonsensical repeated segments.
624            let cycle_start = visited[&cur];
625            chain.truncate(cycle_start);
626            break;
627        }
628        let Some(&idx) = id_to_idx.get(&cur) else {
629            break;
630        };
631        visited.insert(cur, chain.len());
632        chain.push(cur);
633        let parent = items[idx].parent_id;
634        if parent == 0
635            || parent == FS_TREE_OBJECTID
636            || parent == top_id
637            || !id_to_idx.contains_key(&parent)
638        {
639            break;
640        }
641        cur = parent;
642    }
643
644    // Resolve each node in the chain from root toward the target, building
645    // on any already-cached prefix we stopped at.
646    for &id in chain.iter().rev() {
647        let Some(&idx) = id_to_idx.get(&id) else {
648            cache.insert(id, String::new());
649            continue;
650        };
651        let segment = &segments[idx];
652        let parent_id = items[idx].parent_id;
653
654        let full_path = if parent_id == 0
655            || parent_id == FS_TREE_OBJECTID
656            || parent_id == top_id
657            || !id_to_idx.contains_key(&parent_id)
658        {
659            segment.clone()
660        } else if let Some(parent_path) = cache.get(&parent_id) {
661            if parent_path.is_empty() {
662                segment.clone()
663            } else {
664                format!("{}/{}", parent_path, segment)
665            }
666        } else {
667            segment.clone()
668        };
669
670        cache.insert(id, full_path);
671    }
672
673    cache.get(&root_id).cloned().unwrap_or_default()
674}
675
676/// `btrfs_root_item` field offsets (packed, LE).
677fn parse_root_item(root_id: u64, data: &[u8]) -> Option<SubvolumeListItem> {
678    use std::mem::offset_of;
679
680    // Items shorter than generation_v2 are "legacy" and do not carry
681    // UUID / otime / otransid fields.
682    let legacy_boundary = offset_of!(btrfs_root_item, generation_v2);
683    if data.len() < legacy_boundary {
684        return None;
685    }
686
687    let generation = read_le_u64(data, offset_of!(btrfs_root_item, generation));
688    let flags_raw = read_le_u64(data, offset_of!(btrfs_root_item, flags));
689    let flags = SubvolumeFlags::from_bits_truncate(flags_raw);
690
691    // Extended fields exist only in non-legacy items.
692    let otime_nsec =
693        offset_of!(btrfs_root_item, otime) + offset_of!(btrfs_timespec, nsec);
694    let (uuid, parent_uuid, received_uuid, otransid, otime) = if data.len()
695        >= otime_nsec + field_size!(btrfs_timespec, nsec)
696    {
697        let off_uuid = offset_of!(btrfs_root_item, uuid);
698        let off_parent = offset_of!(btrfs_root_item, parent_uuid);
699        let off_received = offset_of!(btrfs_root_item, received_uuid);
700        let uuid_size = field_size!(btrfs_root_item, uuid);
701        let uuid = Uuid::from_bytes(
702            data[off_uuid..off_uuid + uuid_size].try_into().unwrap(),
703        );
704        let parent_uuid = Uuid::from_bytes(
705            data[off_parent..off_parent + uuid_size].try_into().unwrap(),
706        );
707        let received_uuid = Uuid::from_bytes(
708            data[off_received..off_received + uuid_size]
709                .try_into()
710                .unwrap(),
711        );
712        let otransid = read_le_u64(data, offset_of!(btrfs_root_item, otransid));
713        let otime_sec = offset_of!(btrfs_root_item, otime);
714        let otime = timespec_to_system_time(
715            read_le_u64(data, otime_sec),
716            read_le_u32(data, otime_nsec),
717        );
718        (uuid, parent_uuid, received_uuid, otransid, otime)
719    } else {
720        (Uuid::nil(), Uuid::nil(), Uuid::nil(), 0, UNIX_EPOCH)
721    };
722
723    Some(SubvolumeListItem {
724        root_id,
725        parent_id: 0,
726        dir_id: 0,
727        generation,
728        flags,
729        uuid,
730        parent_uuid,
731        received_uuid,
732        otransid,
733        otime,
734        name: String::new(),
735    })
736}
737
738/// Parse a `btrfs_root_ref` payload (packed, LE). The name immediately
739/// follows the fixed-size header.
740fn parse_root_ref(data: &[u8]) -> Option<(u64, String)> {
741    use crate::raw::btrfs_root_ref;
742    use std::mem::{offset_of, size_of};
743
744    let header_size = size_of::<btrfs_root_ref>();
745    if data.len() < header_size {
746        return None;
747    }
748    let dir_id = read_le_u64(data, offset_of!(btrfs_root_ref, dirid));
749    let name_off = offset_of!(btrfs_root_ref, name_len);
750    let name_len =
751        u16::from_le_bytes([data[name_off], data[name_off + 1]]) as usize;
752    if data.len() < header_size + name_len {
753        return None;
754    }
755    let name =
756        String::from_utf8_lossy(&data[header_size..header_size + name_len])
757            .into_owned();
758    Some((dir_id, name))
759}
760
761/// Convert an on-disk `btrfs_timespec` (LE sec + LE nsec, packed) to
762/// [`SystemTime`].  Returns [`UNIX_EPOCH`] if sec == 0.
763fn timespec_to_system_time(sec: u64, nsec: u32) -> SystemTime {
764    if sec == 0 {
765        return UNIX_EPOCH;
766    }
767    UNIX_EPOCH + Duration::new(sec, nsec)
768}
769
770/// Convert a `btrfs_ioctl_timespec` (host byte order, with padding) to
771/// [`SystemTime`].  Returns [`UNIX_EPOCH`] if sec == 0.
772fn ioctl_timespec_to_system_time(sec: u64, nsec: u32) -> SystemTime {
773    if sec == 0 {
774        return UNIX_EPOCH;
775    }
776    UNIX_EPOCH + Duration::new(sec, nsec)
777}
778
779/// A child subvolume reference returned by [`subvol_rootrefs`].
780#[derive(Debug, Clone, Copy, PartialEq, Eq)]
781pub struct SubvolRootRef {
782    /// Root ID (tree ID) of the child subvolume.
783    pub treeid: u64,
784    /// Directory inode ID where the child is attached in the parent.
785    pub dirid: u64,
786}
787
788/// List the child subvolumes directly under the subvolume opened by `fd`.
789///
790/// Returns all subvolumes whose root is referenced from the given
791/// subvolume. The kernel returns results in batches of up to 255; this
792/// function handles continuation automatically by advancing `min_treeid`.
793///
794/// Does not require `CAP_SYS_ADMIN`.
795///
796/// Errors: ENOTTY on kernels older than 4.18.
797pub fn subvol_rootrefs(fd: BorrowedFd) -> nix::Result<Vec<SubvolRootRef>> {
798    use crate::raw::{
799        btrfs_ioc_get_subvol_rootref, btrfs_ioctl_get_subvol_rootref_args,
800    };
801
802    let mut results = Vec::new();
803    let mut min_treeid: u64 = 0;
804
805    loop {
806        let mut args: btrfs_ioctl_get_subvol_rootref_args =
807            unsafe { std::mem::zeroed() };
808        args.min_treeid = min_treeid;
809
810        let ret =
811            unsafe { btrfs_ioc_get_subvol_rootref(fd.as_raw_fd(), &mut args) };
812
813        // The kernel returns EOVERFLOW when there are more results than
814        // fit in one batch. We read what we got and loop with the updated
815        // min_treeid.
816        let overflow = match ret {
817            Ok(_) => false,
818            Err(nix::errno::Errno::EOVERFLOW) => true,
819            Err(e) => return Err(e),
820        };
821
822        let count = args.num_items as usize;
823        for i in 0..count {
824            let r = &args.rootref[i];
825            results.push(SubvolRootRef {
826                treeid: r.treeid,
827                dirid: r.dirid,
828            });
829        }
830
831        if !overflow || count == 0 {
832            break;
833        }
834
835        // Advance past the last returned treeid for the next batch.
836        min_treeid = args.rootref[count - 1].treeid + 1;
837    }
838
839    Ok(results)
840}
841
842/// Wait for a specific deleted subvolume to be fully cleaned by the kernel.
843///
844/// Blocks until the background cleaner has finished removing the on-disk
845/// data for the given subvolume ID. Returns `Ok(())` both when the wait
846/// completes and when the subvolume is already gone (`ENOENT`).
847/// Useful after `subvolume_delete` when subsequent operations depend on
848/// the subvolume being fully gone (e.g. qgroup staleness checks).
849pub fn subvol_sync_wait_one(fd: BorrowedFd, subvolid: u64) -> nix::Result<()> {
850    let args = btrfs_ioctl_subvol_wait {
851        subvolid,
852        mode: BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE,
853        count: 0,
854    };
855    match unsafe { btrfs_ioc_subvol_sync_wait(fd.as_raw_fd(), &args) } {
856        Ok(_) | Err(nix::errno::Errno::ENOENT) => Ok(()),
857        Err(e) => Err(e),
858    }
859}
860
861/// Wait for all currently queued subvolume deletions to complete.
862///
863/// Blocks until every subvolume that was in the deletion queue at the time
864/// of the call has been fully cleaned. Does not wait for subvolumes
865/// deleted after the call is made.
866pub fn subvol_sync_wait_all(fd: BorrowedFd) -> nix::Result<()> {
867    let args = btrfs_ioctl_subvol_wait {
868        subvolid: 0,
869        mode: BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED,
870        count: 0,
871    };
872    unsafe { btrfs_ioc_subvol_sync_wait(fd.as_raw_fd(), &args) }?;
873    Ok(())
874}
875
876#[cfg(test)]
877mod tests {
878    use super::*;
879    use std::{
880        collections::HashMap,
881        time::{Duration, UNIX_EPOCH},
882    };
883    use uuid::Uuid;
884
885    fn test_item(root_id: u64, parent_id: u64) -> SubvolumeListItem {
886        SubvolumeListItem {
887            root_id,
888            parent_id,
889            dir_id: 0,
890            generation: 0,
891            flags: SubvolumeFlags::empty(),
892            uuid: Uuid::nil(),
893            parent_uuid: Uuid::nil(),
894            received_uuid: Uuid::nil(),
895            otransid: 0,
896            otime: UNIX_EPOCH,
897            name: String::new(),
898        }
899    }
900
901    #[test]
902    fn timespec_zero_returns_epoch() {
903        assert_eq!(timespec_to_system_time(0, 0), UNIX_EPOCH);
904    }
905
906    #[test]
907    fn timespec_zero_sec_with_nonzero_nsec_returns_epoch() {
908        // sec==0 triggers the early return regardless of nsec
909        assert_eq!(timespec_to_system_time(0, 500_000_000), UNIX_EPOCH);
910    }
911
912    #[test]
913    fn timespec_nonzero_returns_correct_time() {
914        let t = timespec_to_system_time(1000, 500);
915        assert_eq!(t, UNIX_EPOCH + Duration::new(1000, 500));
916    }
917
918    #[test]
919    fn subvolume_flags_display_readonly() {
920        let flags = SubvolumeFlags::RDONLY;
921        assert_eq!(format!("{}", flags), "readonly");
922    }
923
924    #[test]
925    fn subvolume_flags_display_empty() {
926        let flags = SubvolumeFlags::empty();
927        assert_eq!(format!("{}", flags), "-");
928    }
929
930    #[test]
931    fn parse_root_ref_valid() {
932        // btrfs_root_ref: dirid (8 LE) + sequence (8 LE) + name_len (2 LE) + name bytes
933        let name = b"mysubvol";
934        let mut buf = Vec::new();
935        buf.extend_from_slice(&42u64.to_le_bytes()); // dirid
936        buf.extend_from_slice(&1u64.to_le_bytes()); // sequence
937        buf.extend_from_slice(&(name.len() as u16).to_le_bytes()); // name_len
938        buf.extend_from_slice(name);
939
940        let result = parse_root_ref(&buf);
941        assert!(result.is_some());
942        let (dir_id, parsed_name) = result.unwrap();
943        assert_eq!(dir_id, 42);
944        assert_eq!(parsed_name, "mysubvol");
945    }
946
947    #[test]
948    fn parse_root_ref_too_short_header() {
949        // Less than 18 bytes (sizeof btrfs_root_ref)
950        let buf = [0u8; 10];
951        assert!(parse_root_ref(&buf).is_none());
952    }
953
954    #[test]
955    fn parse_root_ref_too_short_name() {
956        // Header claims 10-byte name but buffer only has the header
957        let mut buf = vec![0u8; 18];
958        // Set name_len = 10 at offset 16
959        buf[16] = 10;
960        buf[17] = 0;
961        assert!(parse_root_ref(&buf).is_none());
962    }
963
964    #[test]
965    fn parse_root_ref_empty_name() {
966        let mut buf = Vec::new();
967        buf.extend_from_slice(&100u64.to_le_bytes()); // dirid
968        buf.extend_from_slice(&0u64.to_le_bytes()); // sequence
969        buf.extend_from_slice(&0u16.to_le_bytes()); // name_len = 0
970
971        let result = parse_root_ref(&buf);
972        assert!(result.is_some());
973        let (dir_id, parsed_name) = result.unwrap();
974        assert_eq!(dir_id, 100);
975        assert_eq!(parsed_name, "");
976    }
977
978    #[test]
979    fn build_full_path_single_subvol_parent_fs_tree() {
980        // Subvolume 256, parent is FS_TREE (5)
981        let items = vec![test_item(256, FS_TREE_OBJECTID)];
982        let segments = vec!["mysub".to_string()];
983        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
984        let mut cache = HashMap::new();
985
986        let path = build_full_path(
987            256,
988            FS_TREE_OBJECTID,
989            &id_to_idx,
990            &segments,
991            &items,
992            &mut cache,
993        );
994        assert_eq!(path, "mysub");
995    }
996
997    #[test]
998    fn build_full_path_nested_chain() {
999        // A (256) -> B (257) -> C (258), all parented under FS_TREE
1000        let items = vec![
1001            test_item(256, FS_TREE_OBJECTID),
1002            test_item(257, 256),
1003            test_item(258, 257),
1004        ];
1005        let segments = vec!["A".to_string(), "B".to_string(), "C".to_string()];
1006        let id_to_idx: HashMap<u64, usize> =
1007            [(256, 0), (257, 1), (258, 2)].into();
1008        let mut cache = HashMap::new();
1009
1010        let path = build_full_path(
1011            258,
1012            FS_TREE_OBJECTID,
1013            &id_to_idx,
1014            &segments,
1015            &items,
1016            &mut cache,
1017        );
1018        assert_eq!(path, "A/B/C");
1019    }
1020
1021    #[test]
1022    fn build_full_path_stops_at_top_id() {
1023        // A (256) -> B (257) -> C (258), top_id = 257 (B)
1024        // Paths are relative to top_id, so C's parent (257) == top_id means
1025        // C's path is just its own segment, not "A/B/C".
1026        let items = vec![
1027            test_item(256, FS_TREE_OBJECTID),
1028            test_item(257, 256),
1029            test_item(258, 257),
1030        ];
1031        let segments = vec!["A".to_string(), "B".to_string(), "C".to_string()];
1032        let id_to_idx: HashMap<u64, usize> =
1033            [(256, 0), (257, 1), (258, 2)].into();
1034        let mut cache = HashMap::new();
1035
1036        let path = build_full_path(
1037            258, 257, &id_to_idx, &segments, &items, &mut cache,
1038        );
1039        assert_eq!(path, "C");
1040
1041        // B's path is also just "B" (its parent 256/A is below top_id in the
1042        // tree, but B's own parent is not top_id — A's parent is FS_TREE).
1043        // With top_id=257, building B: parent=256, 256 is in id_to_idx but
1044        // 256's parent is FS_TREE (5) which triggers the stop, so chain = [257, 256],
1045        // and A gets its segment, B gets "A/B".
1046        let path_b = build_full_path(
1047            257, 257, &id_to_idx, &segments, &items, &mut cache,
1048        );
1049        // 257 itself: its parent is 256, 256 != top_id (257), so we walk up.
1050        // 256's parent is FS_TREE (5), which triggers stop. chain = [257, 256].
1051        // 256 resolves to "A" (parent is FS_TREE), 257 resolves to "A/B".
1052        assert_eq!(path_b, "A/B");
1053    }
1054
1055    #[test]
1056    fn build_full_path_cycle_detection() {
1057        // A (256) parent is B (257), B (257) parent is A (256) — mutual cycle
1058        let items = vec![test_item(256, 257), test_item(257, 256)];
1059        let segments = vec!["A".to_string(), "B".to_string()];
1060        let id_to_idx: HashMap<u64, usize> = [(256, 0), (257, 1)].into();
1061        let mut cache = HashMap::new();
1062
1063        // Must not hang. The result is truncated due to cycle detection.
1064        let _path = build_full_path(
1065            256,
1066            FS_TREE_OBJECTID,
1067            &id_to_idx,
1068            &segments,
1069            &items,
1070            &mut cache,
1071        );
1072        // Just verify it terminates and returns something (exact value depends
1073        // on cycle truncation heuristic).
1074    }
1075
1076    #[test]
1077    fn build_full_path_cached_ancestor() {
1078        // A (256) -> B (257) -> C (258)
1079        // Pre-cache B's path; building C should use it.
1080        let items = vec![
1081            test_item(256, FS_TREE_OBJECTID),
1082            test_item(257, 256),
1083            test_item(258, 257),
1084        ];
1085        let segments = vec!["A".to_string(), "B".to_string(), "C".to_string()];
1086        let id_to_idx: HashMap<u64, usize> =
1087            [(256, 0), (257, 1), (258, 2)].into();
1088        let mut cache = HashMap::new();
1089        cache.insert(257, "A/B".to_string());
1090
1091        let path = build_full_path(
1092            258,
1093            FS_TREE_OBJECTID,
1094            &id_to_idx,
1095            &segments,
1096            &items,
1097            &mut cache,
1098        );
1099        assert_eq!(path, "A/B/C");
1100    }
1101
1102    #[test]
1103    fn build_full_path_unknown_parent() {
1104        // Subvolume 256, parent 999 not in id_to_idx
1105        let items = vec![test_item(256, 999)];
1106        let segments = vec!["orphan".to_string()];
1107        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
1108        let mut cache = HashMap::new();
1109
1110        let path = build_full_path(
1111            256,
1112            FS_TREE_OBJECTID,
1113            &id_to_idx,
1114            &segments,
1115            &items,
1116            &mut cache,
1117        );
1118        assert_eq!(path, "orphan");
1119    }
1120
1121    #[test]
1122    fn build_full_path_parent_id_zero() {
1123        // Subvolume 256, parent_id == 0 (no backref found)
1124        let items = vec![test_item(256, 0)];
1125        let segments = vec!["noparent".to_string()];
1126        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
1127        let mut cache = HashMap::new();
1128
1129        let path = build_full_path(
1130            256,
1131            FS_TREE_OBJECTID,
1132            &id_to_idx,
1133            &segments,
1134            &items,
1135            &mut cache,
1136        );
1137        assert_eq!(path, "noparent");
1138    }
1139
1140    #[test]
1141    fn build_full_path_already_cached_target() {
1142        // If the target itself is already cached, return the cached value.
1143        let items = vec![test_item(256, FS_TREE_OBJECTID)];
1144        let segments = vec!["A".to_string()];
1145        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
1146        let mut cache = HashMap::new();
1147        cache.insert(256, "cached/path".to_string());
1148
1149        let path = build_full_path(
1150            256,
1151            FS_TREE_OBJECTID,
1152            &id_to_idx,
1153            &segments,
1154            &items,
1155            &mut cache,
1156        );
1157        assert_eq!(path, "cached/path");
1158    }
1159
1160    #[test]
1161    fn build_full_path_root_id_not_in_items() {
1162        // root_id not present in id_to_idx at all
1163        let items = vec![test_item(256, FS_TREE_OBJECTID)];
1164        let segments = vec!["A".to_string()];
1165        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
1166        let mut cache = HashMap::new();
1167
1168        let path = build_full_path(
1169            999,
1170            FS_TREE_OBJECTID,
1171            &id_to_idx,
1172            &segments,
1173            &items,
1174            &mut cache,
1175        );
1176        assert_eq!(path, "");
1177    }
1178}