Skip to main content

btrfs_uapi/
subvolume.rs

1//! # Subvolume and snapshot management: creating, deleting, and querying subvolumes
2//!
3//! Subvolumes are independently snapshotable subtrees within a btrfs filesystem.
4//! Snapshots are subvolumes created as copy-on-write clones of an existing
5//! subvolume.  This module covers the full lifecycle: creating and deleting
6//! subvolumes and snapshots, reading subvolume metadata and flags, listing all
7//! subvolumes in a filesystem, and getting or setting the default subvolume
8//! that is mounted when no subvolume is explicitly requested.
9
10use crate::{
11    raw::{
12        BTRFS_DIR_ITEM_KEY, BTRFS_FIRST_FREE_OBJECTID, BTRFS_FS_TREE_OBJECTID,
13        BTRFS_LAST_FREE_OBJECTID, BTRFS_ROOT_BACKREF_KEY, BTRFS_ROOT_ITEM_KEY,
14        BTRFS_ROOT_TREE_DIR_OBJECTID, BTRFS_ROOT_TREE_OBJECTID,
15        BTRFS_SUBVOL_QGROUP_INHERIT, BTRFS_SUBVOL_RDONLY,
16        BTRFS_SUBVOL_SPEC_BY_ID, BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE,
17        BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED, btrfs_ioc_default_subvol,
18        btrfs_ioc_get_subvol_info, btrfs_ioc_ino_lookup,
19        btrfs_ioc_snap_create_v2, btrfs_ioc_snap_destroy_v2,
20        btrfs_ioc_subvol_create_v2, btrfs_ioc_subvol_getflags,
21        btrfs_ioc_subvol_setflags, btrfs_ioc_subvol_sync_wait,
22        btrfs_ioctl_get_subvol_info_args, btrfs_ioctl_ino_lookup_args,
23        btrfs_ioctl_subvol_wait, btrfs_ioctl_vol_args_v2, btrfs_qgroup_inherit,
24    },
25    tree_search::{SearchFilter, tree_search},
26};
27use bitflags::bitflags;
28use nix::libc::c_char;
29use std::{
30    collections::HashMap,
31    ffi::CStr,
32    mem,
33    os::{fd::AsRawFd, unix::io::BorrowedFd},
34    time::{Duration, SystemTime, UNIX_EPOCH},
35};
36use uuid::Uuid;
37
/// The top-level subvolume (FS tree); objectid 5, always present.
///
/// Returned by [`subvolume_default_get`] when no explicit default has been set.
/// This is the raw `BTRFS_FS_TREE_OBJECTID` binding widened to `u64` so it can
/// be compared directly with the `u64` subvolume IDs used throughout this
/// module.
pub const FS_TREE_OBJECTID: u64 = BTRFS_FS_TREE_OBJECTID as u64;
42
bitflags! {
    /// Flags on a btrfs subvolume (the `flags` field of the root item /
    /// `BTRFS_IOC_SUBVOL_{GET,SET}FLAGS`).
    ///
    /// Only the read-only bit is modelled here.  The conversions from raw
    /// values in this module use `from_bits_truncate`, so any other bits
    /// present in a raw value are discarded.
    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
    pub struct SubvolumeFlags: u64 {
        /// Subvolume is read-only.
        const RDONLY = BTRFS_SUBVOL_RDONLY as u64;
    }
}
52
53impl std::fmt::Display for SubvolumeFlags {
54    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
55        if self.contains(SubvolumeFlags::RDONLY) {
56            write!(f, "readonly")
57        } else {
58            write!(f, "-")
59        }
60    }
61}
62
/// Subvolume metadata returned by `BTRFS_IOC_GET_SUBVOL_INFO`.
///
/// Built by [`subvolume_info_by_id`] from the raw ioctl result.  The
/// `SystemTime` fields are converted from raw `(sec, nsec)` pairs; a pair
/// whose seconds field is zero is reported as `UNIX_EPOCH`.
#[derive(Debug, Clone)]
pub struct SubvolumeInfo {
    /// Root ID (subvolume ID) of this subvolume.
    pub id: u64,
    /// Name of this subvolume within its parent directory.
    pub name: String,
    /// Root ID of the parent subvolume.
    pub parent_id: u64,
    /// Inode number of the directory within the parent that holds this subvolume.
    pub dir_id: u64,
    /// Current generation of the subvolume.
    pub generation: u64,
    /// Subvolume flags (e.g. read-only).
    pub flags: SubvolumeFlags,
    /// UUID of this subvolume.
    pub uuid: Uuid,
    /// UUID of the parent subvolume (non-nil for snapshots).
    pub parent_uuid: Uuid,
    /// UUID of the received subvolume (non-nil for received snapshots).
    pub received_uuid: Uuid,
    /// Transaction ID when the subvolume was last changed.
    pub ctransid: u64,
    /// Transaction ID when the subvolume was created.
    pub otransid: u64,
    /// Transaction ID when a send was last performed.
    pub stransid: u64,
    /// Transaction ID when this subvolume was last received.
    pub rtransid: u64,
    /// Time of last change.
    pub ctime: SystemTime,
    /// Creation time.
    pub otime: SystemTime,
    /// Time of last send.
    pub stime: SystemTime,
    /// Time of last receive.
    pub rtime: SystemTime,
}
101
/// A single subvolume entry returned by [`subvolume_list`].
#[derive(Debug, Clone)]
pub struct SubvolumeListItem {
    /// Root ID (subvolume ID).
    pub root_id: u64,
    /// Root ID of the parent subvolume (`0` if unknown / not found).
    pub parent_id: u64,
    /// Inode of the directory in the parent that contains this subvolume.
    pub dir_id: u64,
    /// Current generation.
    pub generation: u64,
    /// Subvolume flags.
    pub flags: SubvolumeFlags,
    /// UUID of this subvolume.
    pub uuid: Uuid,
    /// UUID of the parent subvolume.
    pub parent_uuid: Uuid,
    /// UUID of the received subvolume.
    pub received_uuid: Uuid,
    /// Transaction ID when created.
    pub otransid: u64,
    /// Creation time.
    pub otime: SystemTime,
    /// Subvolume path.
    ///
    /// [`subvolume_list`] first stores the leaf name from the root backref
    /// item here and then replaces it with a path assembled by walking the
    /// chain of parent subvolumes and joining their segments with `/`.  Each
    /// segment is the leaf name prefixed with the directory path obtained via
    /// `BTRFS_IOC_INO_LOOKUP`; when that lookup fails the bare leaf name is
    /// used for the segment.  The field is empty when no backref was found
    /// for the subvolume.
    pub name: String,
}
132
133/// Write `name` into the `name` union member of a zeroed
134/// `btrfs_ioctl_vol_args_v2`, returning `ENAMETOOLONG` if it does not fit.
135#[allow(clippy::cast_possible_wrap)] // ASCII bytes always fit in c_char
136fn set_v2_name(
137    args: &mut btrfs_ioctl_vol_args_v2,
138    name: &CStr,
139) -> nix::Result<()> {
140    let bytes = name.to_bytes(); // excludes nul terminator
141    // SAFETY: name is the active union member; the struct is already zeroed so
142    // the implicit nul terminator is already present.
143    let name_buf: &mut [c_char] = unsafe { &mut args.__bindgen_anon_2.name };
144    if bytes.len() >= name_buf.len() {
145        return Err(nix::errno::Errno::ENAMETOOLONG);
146    }
147    for (i, &b) in bytes.iter().enumerate() {
148        name_buf[i] = b as c_char;
149    }
150    Ok(())
151}
152
153/// Build a `btrfs_qgroup_inherit` buffer for the given qgroup IDs.
154///
155/// The returned `Vec<u64>` is sized to hold the base struct plus the trailing
156/// `qgroups[]` array, with 8-byte alignment guaranteed by the `u64` element
157/// type.
158fn build_qgroup_inherit(qgroups: &[u64]) -> Vec<u64> {
159    let base_size = mem::size_of::<btrfs_qgroup_inherit>();
160    let total_size = base_size + std::mem::size_of_val(qgroups);
161    let num_u64 = total_size.div_ceil(8);
162    let mut buf = vec![0u64; num_u64];
163
164    // SAFETY: buf is large enough and zeroed; we write through a properly
165    // aligned pointer (btrfs_qgroup_inherit has 8-byte alignment).
166    let inherit =
167        unsafe { &mut *buf.as_mut_ptr().cast::<btrfs_qgroup_inherit>() };
168    inherit.num_qgroups = qgroups.len() as u64;
169
170    // Write the qgroup IDs into the flexible array member.
171    if !qgroups.is_empty() {
172        let array = unsafe { inherit.qgroups.as_mut_slice(qgroups.len()) };
173        array.copy_from_slice(qgroups);
174    }
175
176    buf
177}
178
179/// Set the `BTRFS_SUBVOL_QGROUP_INHERIT` fields on a `vol_args_v2` struct.
180///
181/// `buf` must be the buffer returned by `build_qgroup_inherit`.
182fn set_qgroup_inherit(
183    args: &mut btrfs_ioctl_vol_args_v2,
184    buf: &[u64],
185    num_qgroups: usize,
186) {
187    args.flags |= u64::from(BTRFS_SUBVOL_QGROUP_INHERIT);
188    let base_size = mem::size_of::<btrfs_qgroup_inherit>();
189    let total_size = base_size + num_qgroups * mem::size_of::<u64>();
190    args.__bindgen_anon_1.__bindgen_anon_1.size = total_size as u64;
191    args.__bindgen_anon_1.__bindgen_anon_1.qgroup_inherit =
192        buf.as_ptr() as *mut btrfs_qgroup_inherit;
193}
194
195/// Create a new subvolume named `name` inside the directory referred to by
196/// `parent_fd`.
197///
198/// `name` must be a plain leaf name (no slashes).  The caller is responsible
199/// for opening the correct parent directory.  If `qgroups` is non-empty, the
200/// new subvolume is added to those qgroups.  Requires `CAP_SYS_ADMIN`.
201///
202/// Errors: ENAMETOOLONG if `name` does not fit in the 4040-byte kernel
203/// buffer.  EEXIST if a subvolume or directory with that name already exists.
204/// `EPERM` without `CAP_SYS_ADMIN`.
205///
206/// # Errors
207///
208/// Returns `Err` if the ioctl fails.
209pub fn subvolume_create(
210    parent_fd: BorrowedFd,
211    name: &CStr,
212    qgroups: &[u64],
213) -> nix::Result<()> {
214    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
215    set_v2_name(&mut args, name)?;
216
217    let inherit_buf;
218    if !qgroups.is_empty() {
219        inherit_buf = build_qgroup_inherit(qgroups);
220        set_qgroup_inherit(&mut args, &inherit_buf, qgroups.len());
221    }
222
223    unsafe {
224        btrfs_ioc_subvol_create_v2(parent_fd.as_raw_fd(), &raw const args)
225    }?;
226    Ok(())
227}
228
229/// Delete the subvolume or snapshot named `name` from the directory referred
230/// to by `parent_fd`.
231///
232/// `name` must be a plain leaf name (no slashes).  Requires `CAP_SYS_ADMIN`.
233///
234/// Deletion is asynchronous: the ioctl removes the directory entry
235/// immediately, but the kernel cleaner thread reclaims the on-disk data
236/// in the background. Until the next transaction commit the deletion is
237/// not visible to other operations (e.g. `subvolume_list` still shows
238/// the subvolume). Call `sync` to force a commit, or pass
239/// `-c`/`--commit-after` at the CLI level. To wait for the cleaner to
240/// finish, use [`subvol_sync_wait_one`].
241///
242/// # Errors
243///
244/// Returns `Err` if the ioctl fails.
245pub fn subvolume_delete(parent_fd: BorrowedFd, name: &CStr) -> nix::Result<()> {
246    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
247    set_v2_name(&mut args, name)?;
248    unsafe {
249        btrfs_ioc_snap_destroy_v2(parent_fd.as_raw_fd(), &raw const args)
250    }?;
251    Ok(())
252}
253
254/// Delete a subvolume by its numeric subvolume ID.
255///
256/// `fd` must be an open file descriptor on the filesystem (typically the mount
257/// point).  Unlike `subvolume_delete`, this does not require knowing the
258/// subvolume's path.  Requires `CAP_SYS_ADMIN`.
259///
260/// See [`subvolume_delete`] for details on commit visibility and async
261/// cleanup.
262///
263/// # Errors
264///
265/// Returns `Err` if the ioctl fails.
266pub fn subvolume_delete_by_id(
267    fd: BorrowedFd,
268    subvolid: u64,
269) -> nix::Result<()> {
270    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
271    args.flags = u64::from(BTRFS_SUBVOL_SPEC_BY_ID);
272    args.__bindgen_anon_2.subvolid = subvolid;
273    unsafe { btrfs_ioc_snap_destroy_v2(fd.as_raw_fd(), &raw const args) }?;
274    Ok(())
275}
276
277/// Create a snapshot of the subvolume referred to by `source_fd`, placing it
278/// as `name` inside the directory referred to by `parent_fd`.
279///
280/// If `readonly` is `true` the new snapshot is created read-only.  If
281/// `qgroups` is non-empty, the new snapshot is added to those qgroups.
282/// Requires `CAP_SYS_ADMIN`.
283///
284/// Errors: ENAMETOOLONG if `name` does not fit in the 4040-byte kernel
285/// buffer.  EEXIST if a subvolume or directory with that name already exists.
286/// EROFS if `parent_fd` refers to a read-only subvolume.  EPERM without
287/// `CAP_SYS_ADMIN`.
288///
289/// # Errors
290///
291/// Returns `Err` if the ioctl fails.
292pub fn snapshot_create(
293    parent_fd: BorrowedFd,
294    source_fd: BorrowedFd,
295    name: &CStr,
296    readonly: bool,
297    qgroups: &[u64],
298) -> nix::Result<()> {
299    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
300    // The `fd` field carries the source subvolume file descriptor.
301    args.fd = i64::from(source_fd.as_raw_fd());
302    if readonly {
303        args.flags = u64::from(BTRFS_SUBVOL_RDONLY);
304    }
305    set_v2_name(&mut args, name)?;
306
307    let inherit_buf;
308    if !qgroups.is_empty() {
309        inherit_buf = build_qgroup_inherit(qgroups);
310        set_qgroup_inherit(&mut args, &inherit_buf, qgroups.len());
311    }
312
313    unsafe {
314        btrfs_ioc_snap_create_v2(parent_fd.as_raw_fd(), &raw const args)
315    }?;
316    Ok(())
317}
318
319/// Query detailed information about the subvolume that `fd` belongs to.
320///
321/// `fd` can be any file or directory within the target subvolume.
322/// Does not require elevated privileges.
323///
324/// # Errors
325///
326/// Returns `Err` if the ioctl fails.
327pub fn subvolume_info(fd: BorrowedFd) -> nix::Result<SubvolumeInfo> {
328    subvolume_info_by_id(fd, 0)
329}
330
331/// Query detailed information about a subvolume by its numeric root ID.
332///
333/// `fd` can be any open file descriptor on the filesystem.  If `rootid` is 0,
334/// the subvolume that `fd` belongs to is queried (equivalent to
335/// `subvolume_info`).  Does not require elevated privileges.
336///
337/// Errors: ENOENT if no subvolume with that `rootid` exists (or has been
338/// deleted but not yet cleaned).
339///
340/// # Errors
341///
342/// Returns `Err` if the ioctl fails.
343pub fn subvolume_info_by_id(
344    fd: BorrowedFd,
345    rootid: u64,
346) -> nix::Result<SubvolumeInfo> {
347    let mut raw: btrfs_ioctl_get_subvol_info_args = unsafe { mem::zeroed() };
348    raw.treeid = rootid;
349    unsafe { btrfs_ioc_get_subvol_info(fd.as_raw_fd(), &raw mut raw) }?;
350
351    let name = unsafe { CStr::from_ptr(raw.name.as_ptr()) }
352        .to_string_lossy()
353        .into_owned();
354
355    Ok(SubvolumeInfo {
356        id: raw.treeid,
357        name,
358        parent_id: raw.parent_id,
359        dir_id: raw.dirid,
360        generation: raw.generation,
361        flags: SubvolumeFlags::from_bits_truncate(raw.flags),
362        uuid: Uuid::from_bytes(raw.uuid),
363        parent_uuid: Uuid::from_bytes(raw.parent_uuid),
364        received_uuid: Uuid::from_bytes(raw.received_uuid),
365        ctransid: raw.ctransid,
366        otransid: raw.otransid,
367        stransid: raw.stransid,
368        rtransid: raw.rtransid,
369        ctime: timespec_to_system_time(raw.ctime.sec, raw.ctime.nsec),
370        otime: timespec_to_system_time(raw.otime.sec, raw.otime.nsec),
371        stime: timespec_to_system_time(raw.stime.sec, raw.stime.nsec),
372        rtime: timespec_to_system_time(raw.rtime.sec, raw.rtime.nsec),
373    })
374}
375
376/// Read the flags of the subvolume that `fd` belongs to.
377///
378/// # Errors
379///
380/// Returns `Err` if the ioctl fails.
381pub fn subvolume_flags_get(fd: BorrowedFd) -> nix::Result<SubvolumeFlags> {
382    let mut flags: u64 = 0;
383    unsafe { btrfs_ioc_subvol_getflags(fd.as_raw_fd(), &raw mut flags) }?;
384    Ok(SubvolumeFlags::from_bits_truncate(flags))
385}
386
387/// Set the flags of the subvolume that `fd` belongs to.
388///
389/// Requires `CAP_SYS_ADMIN` to make a subvolume read-only; any user may
390/// clear the read-only flag from a subvolume they own.
391///
392/// # Errors
393///
394/// Returns `Err` if the ioctl fails.
395pub fn subvolume_flags_set(
396    fd: BorrowedFd,
397    flags: SubvolumeFlags,
398) -> nix::Result<()> {
399    let raw: u64 = flags.bits();
400    unsafe { btrfs_ioc_subvol_setflags(fd.as_raw_fd(), &raw const raw) }?;
401    Ok(())
402}
403
404/// Query the ID of the default subvolume of the filesystem referred to by
405/// `fd`.
406///
407/// Searches the root tree for the `BTRFS_DIR_ITEM_KEY` entry at objectid
408/// `BTRFS_ROOT_TREE_DIR_OBJECTID` that stores the default subvolume ID.
409/// Returns [`FS_TREE_OBJECTID`] if no default has been set.
410///
411/// Requires `CAP_SYS_ADMIN`.
412///
413/// # Errors
414///
415/// Returns `Err` if the tree search ioctl fails.
416///
417/// # Panics
418///
419/// Panics if the dir item data is malformed (slice conversion fails). This
420/// cannot happen with valid btrfs on-disk data.
421pub fn subvolume_default_get(fd: BorrowedFd) -> nix::Result<u64> {
422    let mut default_id: Option<u64> = None;
423
424    tree_search(
425        fd,
426        SearchFilter::for_objectid_range(
427            u64::from(BTRFS_ROOT_TREE_OBJECTID),
428            BTRFS_DIR_ITEM_KEY,
429            u64::from(BTRFS_ROOT_TREE_DIR_OBJECTID),
430            u64::from(BTRFS_ROOT_TREE_DIR_OBJECTID),
431        ),
432        |_hdr, data| {
433            use crate::raw::btrfs_dir_item;
434            use std::mem::{offset_of, size_of};
435
436            let header_size = size_of::<btrfs_dir_item>();
437            if data.len() < header_size {
438                return Ok(());
439            }
440            let name_off = offset_of!(btrfs_dir_item, name_len);
441            let name_len =
442                u16::from_le_bytes([data[name_off], data[name_off + 1]])
443                    as usize;
444            if data.len() < header_size + name_len {
445                return Ok(());
446            }
447            let item_name = &data[header_size..header_size + name_len];
448            if item_name == b"default" {
449                let loc_off = offset_of!(btrfs_dir_item, location);
450                let target_id = u64::from_le_bytes(
451                    data[loc_off..loc_off + 8].try_into().unwrap(),
452                );
453                default_id = Some(target_id);
454            }
455            Ok(())
456        },
457    )?;
458
459    Ok(default_id.unwrap_or(u64::from(BTRFS_FS_TREE_OBJECTID)))
460}
461
462/// Set the default subvolume of the filesystem referred to by `fd` to
463/// `subvolid`.
464///
465/// Pass [`FS_TREE_OBJECTID`] to restore the default.  Requires `CAP_SYS_ADMIN`.
466///
467/// # Errors
468///
469/// Returns `Err` if the ioctl fails.
470pub fn subvolume_default_set(fd: BorrowedFd, subvolid: u64) -> nix::Result<()> {
471    unsafe { btrfs_ioc_default_subvol(fd.as_raw_fd(), &raw const subvolid) }?;
472    Ok(())
473}
474
475/// List all user subvolumes and snapshots in the filesystem referred to by
476/// `fd` by walking the root tree.
477///
478/// Two tree-search passes are made:
479/// 1. `ROOT_ITEM_KEY` — reads each subvolume's metadata (generation, flags,
480///    UUIDs, creation time).
481/// 2. `BTRFS_ROOT_BACKREF_KEY` — reads each subvolume's parent ID and leaf name.
482///
483/// Subvolumes for which no backref is found are still included; their
484/// `parent_id`, `dir_id`, and `name` will be zeroed / empty.
485///
486/// Requires `CAP_SYS_ADMIN` for the tree search. Without it the kernel
487/// returns `EPERM`; the caller should degrade gracefully (e.g. show only the
488/// leaf name without full path resolution).
489///
490/// # Errors
491///
492/// Returns `Err` if the tree search ioctl fails.
493#[allow(clippy::cast_sign_loss)] // BTRFS_LAST_FREE_OBJECTID: i32 → u64 intentional
494pub fn subvolume_list(fd: BorrowedFd) -> nix::Result<Vec<SubvolumeListItem>> {
495    let mut items: Vec<SubvolumeListItem> = Vec::new();
496
497    tree_search(
498        fd,
499        SearchFilter::for_objectid_range(
500            u64::from(BTRFS_ROOT_TREE_OBJECTID),
501            BTRFS_ROOT_ITEM_KEY,
502            u64::from(BTRFS_FIRST_FREE_OBJECTID),
503            BTRFS_LAST_FREE_OBJECTID as u64,
504        ),
505        |hdr, data| {
506            if hdr.item_type != BTRFS_ROOT_ITEM_KEY {
507                return Ok(());
508            }
509            if let Some(item) = parse_root_item(hdr.objectid, data) {
510                items.push(item);
511            }
512            Ok(())
513        },
514    )?;
515
516    tree_search(
517        fd,
518        SearchFilter::for_objectid_range(
519            u64::from(BTRFS_ROOT_TREE_OBJECTID),
520            BTRFS_ROOT_BACKREF_KEY,
521            u64::from(BTRFS_FIRST_FREE_OBJECTID),
522            BTRFS_LAST_FREE_OBJECTID as u64,
523        ),
524        |hdr, data| {
525            if hdr.item_type != BTRFS_ROOT_BACKREF_KEY {
526                return Ok(());
527            }
528            // ROOT_BACKREF_KEY: objectid = subvol root_id, offset = parent root_id
529            let root_id = hdr.objectid;
530            let parent_id = hdr.offset;
531
532            if let Some(item) = items.iter_mut().find(|i| i.root_id == root_id)
533            {
534                // Only take the first ROOT_BACKREF for each subvolume.  A
535                // subvolume can have multiple ROOT_BACKREF entries when it is
536                // referenced from more than one parent (e.g. the subvolume
537                // also appears as a snapshot inside another subvolume).
538                // Items are returned in offset-ascending order, so the first
539                // entry has the smallest parent_id — which is the canonical
540                // location btrfs-progs uses for "top level" and path.
541                if item.parent_id == 0 {
542                    item.parent_id = parent_id;
543                    if let Some((dir_id, name)) = parse_root_ref(data) {
544                        item.dir_id = dir_id;
545                        item.name = name;
546                    }
547                }
548            }
549            Ok(())
550        },
551    )?;
552
553    // Determine which subvolume the fd is open on.  Paths are expressed
554    // relative to this subvolume, matching btrfs-progs behaviour.
555    let top_id =
556        crate::inode::lookup_path_rootid(fd).unwrap_or(FS_TREE_OBJECTID);
557
558    resolve_full_paths(fd, &mut items, top_id);
559
560    Ok(items)
561}
562
563/// Call `BTRFS_IOC_INO_LOOKUP` for `dir_id` within `parent_tree` and return
564/// the path from that tree's root to the directory.
565///
566/// The kernel returns the path with a trailing `/` when the directory is not
567/// the tree root, and an empty string when `dir_id` is the tree root itself.
568/// This prefix can be concatenated directly with the subvolume's leaf name to
569/// form the full segment within the parent.
570fn ino_lookup_dir_path(
571    fd: BorrowedFd,
572    parent_tree: u64,
573    dir_id: u64,
574) -> nix::Result<String> {
575    let mut args = btrfs_ioctl_ino_lookup_args {
576        treeid: parent_tree,
577        objectid: dir_id,
578        ..unsafe { mem::zeroed() }
579    };
580    // SAFETY: args is a valid, zeroed btrfs_ioctl_ino_lookup_args; the ioctl
581    // fills in args.name with a null-terminated path string.
582    unsafe { btrfs_ioc_ino_lookup(fd.as_raw_fd(), &raw mut args) }?;
583
584    // args.name is [c_char; 4080]; find the null terminator and decode.
585    let name_ptr: *const c_char = args.name.as_ptr();
586    // SAFETY: the ioctl guarantees null termination within the 4080-byte buffer.
587    let cstr = unsafe { CStr::from_ptr(name_ptr) };
588    Ok(cstr.to_string_lossy().into_owned())
589}
590
591/// Resolve the `name` field of every item in `items` from a bare leaf name to
592/// the full path relative to the filesystem root.
593///
594/// For each subvolume we already have `parent_id`, `dir_id`, and the leaf name
595/// from the `ROOT_BACKREF` pass.  A single `BTRFS_IOC_INO_LOOKUP` call per item
596/// gives the path from the parent tree's root down to the directory that
597/// contains the subvolume (the "dir prefix").  Concatenating that prefix with
598/// the leaf name yields the subvolume's segment within its parent.  Walking up
599/// the parent chain (using the in-memory items map) and joining segments with
600/// `/` gives the final full path.
601fn resolve_full_paths(
602    fd: BorrowedFd,
603    items: &mut [SubvolumeListItem],
604    top_id: u64,
605) {
606    // Map root_id → index for O(1) parent lookups.
607    let id_to_idx: HashMap<u64, usize> = items
608        .iter()
609        .enumerate()
610        .map(|(i, item)| (item.root_id, i))
611        .collect();
612
613    // Compute the "segment" for each item: the path of this subvolume within
614    // its immediate parent (dir prefix from INO_LOOKUP + leaf name).
615    // If INO_LOOKUP is not available (e.g. missing CAP_SYS_ADMIN), fall back
616    // to the bare leaf name so the list still works.
617    let segments: Vec<String> = items
618        .iter()
619        .map(|item| {
620            if item.parent_id == 0 || item.name.is_empty() {
621                return item.name.clone();
622            }
623            match ino_lookup_dir_path(fd, item.parent_id, item.dir_id) {
624                Ok(prefix) => format!("{}{}", prefix, item.name),
625                Err(_) => item.name.clone(),
626            }
627        })
628        .collect();
629
630    // Walk each item's parent chain, joining segments, and cache results so
631    // every ancestor is resolved at most once.
632    let mut full_paths: HashMap<u64, String> = HashMap::new();
633    let root_ids: Vec<u64> = items.iter().map(|i| i.root_id).collect();
634    for root_id in root_ids {
635        build_full_path(
636            root_id,
637            top_id,
638            &id_to_idx,
639            &segments,
640            items,
641            &mut full_paths,
642        );
643    }
644
645    for item in items.iter_mut() {
646        if let Some(path) = full_paths.remove(&item.root_id) {
647            item.name = path;
648        }
649    }
650}
651
652/// Compute the full path for `root_id`, memoizing into `cache`.
653///
654/// Walks the ancestor chain iteratively to avoid stack overflow on deep
655/// subvolume trees.  Collects segments from the target up to the FS tree
656/// root, then joins them in reverse order.
657///
658/// Cycle detection is included: `ROOT_BACKREF` entries can form mutual parent
659/// relationships (e.g. a snapshot stored inside the subvolume it was taken
660/// from), which would otherwise loop forever.
661fn build_full_path(
662    root_id: u64,
663    top_id: u64,
664    id_to_idx: &HashMap<u64, usize>,
665    segments: &[String],
666    items: &[SubvolumeListItem],
667    cache: &mut HashMap<u64, String>,
668) -> String {
669    // Collect the chain of root_ids from `root_id` up to the top subvolume
670    // (the one the fd is open on) or the FS tree root, stopping early if we
671    // hit an already-cached ancestor or a cycle.
672    let mut chain: Vec<u64> = Vec::new();
673    let mut visited: HashMap<u64, usize> = HashMap::new();
674    let mut cur = root_id;
675    loop {
676        if cache.contains_key(&cur) {
677            break;
678        }
679        if visited.contains_key(&cur) {
680            // Cycle detected: truncate the chain back to where the cycle
681            // starts so we don't join nonsensical repeated segments.
682            let cycle_start = visited[&cur];
683            chain.truncate(cycle_start);
684            break;
685        }
686        let Some(&idx) = id_to_idx.get(&cur) else {
687            break;
688        };
689        visited.insert(cur, chain.len());
690        chain.push(cur);
691        let parent = items[idx].parent_id;
692        if parent == 0
693            || parent == FS_TREE_OBJECTID
694            || parent == top_id
695            || !id_to_idx.contains_key(&parent)
696        {
697            break;
698        }
699        cur = parent;
700    }
701
702    // Resolve each node in the chain from root toward the target, building
703    // on any already-cached prefix we stopped at.
704    for &id in chain.iter().rev() {
705        let Some(&idx) = id_to_idx.get(&id) else {
706            cache.insert(id, String::new());
707            continue;
708        };
709        let segment = &segments[idx];
710        let parent_id = items[idx].parent_id;
711
712        let full_path = if parent_id == 0
713            || parent_id == FS_TREE_OBJECTID
714            || parent_id == top_id
715            || !id_to_idx.contains_key(&parent_id)
716        {
717            segment.clone()
718        } else if let Some(parent_path) = cache.get(&parent_id) {
719            if parent_path.is_empty() {
720                segment.clone()
721            } else {
722                format!("{parent_path}/{segment}")
723            }
724        } else {
725            segment.clone()
726        };
727
728        cache.insert(id, full_path);
729    }
730
731    cache.get(&root_id).cloned().unwrap_or_default()
732}
733
734/// Parse a `ROOT_ITEM` payload into a [`SubvolumeListItem`].
735fn parse_root_item(root_id: u64, data: &[u8]) -> Option<SubvolumeListItem> {
736    let ri = btrfs_disk::items::RootItem::parse(data)?;
737    let flags = SubvolumeFlags::from_bits_truncate(ri.flags.bits());
738    let otime = timespec_to_system_time(ri.otime.sec, ri.otime.nsec);
739
740    Some(SubvolumeListItem {
741        root_id,
742        parent_id: 0,
743        dir_id: 0,
744        generation: ri.generation,
745        flags,
746        uuid: ri.uuid,
747        parent_uuid: ri.parent_uuid,
748        received_uuid: ri.received_uuid,
749        otransid: ri.otransid,
750        otime,
751        name: String::new(),
752    })
753}
754
755/// Parse a `btrfs_root_ref` payload (packed, LE). The name immediately
756/// follows the fixed-size header.
757fn parse_root_ref(data: &[u8]) -> Option<(u64, String)> {
758    let rr = btrfs_disk::items::RootRef::parse(data)?;
759    let name = String::from_utf8_lossy(&rr.name).into_owned();
760    Some((rr.dirid, name))
761}
762
/// Convert a `(sec, nsec)` timestamp, counted from the Unix epoch, to
/// [`SystemTime`].
///
/// Returns [`UNIX_EPOCH`] if `sec == 0` (the value used for "never set").
///
/// The inputs come straight from on-disk items and ioctl results, so they are
/// not trusted: an `nsec` of one second or more is carried into the seconds,
/// and a timestamp too large to represent yields [`UNIX_EPOCH`] instead of
/// panicking.
fn timespec_to_system_time(sec: u64, nsec: u32) -> SystemTime {
    if sec == 0 {
        return UNIX_EPOCH;
    }
    // Checked arithmetic throughout: `Duration::new` and `SystemTime + Duration`
    // both panic on overflow, which a corrupt timestamp could trigger.
    Duration::from_secs(sec)
        .checked_add(Duration::from_nanos(u64::from(nsec)))
        .and_then(|since_epoch| UNIX_EPOCH.checked_add(since_epoch))
        .unwrap_or(UNIX_EPOCH)
}
773
/// A child subvolume reference returned by [`subvol_rootrefs`].
///
/// Both fields are copied unchanged from one entry of the
/// `BTRFS_IOC_GET_SUBVOL_ROOTREF` result.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SubvolRootRef {
    /// Root ID (tree ID) of the child subvolume.
    pub treeid: u64,
    /// Directory inode ID where the child is attached in the parent.
    pub dirid: u64,
}
782
783/// List the child subvolumes directly under the subvolume opened by `fd`.
784///
785/// Returns all subvolumes whose root is referenced from the given
786/// subvolume. The kernel returns results in batches of up to 255; this
787/// function handles continuation automatically by advancing `min_treeid`.
788///
789/// Does not require `CAP_SYS_ADMIN`.
790///
791/// Errors: `ENOTTY` on kernels older than 4.18.
792///
793/// # Errors
794///
795/// Returns `Err` if the ioctl fails.
796pub fn subvol_rootrefs(fd: BorrowedFd) -> nix::Result<Vec<SubvolRootRef>> {
797    use crate::raw::{
798        btrfs_ioc_get_subvol_rootref, btrfs_ioctl_get_subvol_rootref_args,
799    };
800
801    let mut results = Vec::new();
802    let mut min_treeid: u64 = 0;
803
804    loop {
805        let mut args: btrfs_ioctl_get_subvol_rootref_args =
806            unsafe { std::mem::zeroed() };
807        args.min_treeid = min_treeid;
808
809        let ret = unsafe {
810            btrfs_ioc_get_subvol_rootref(fd.as_raw_fd(), &raw mut args)
811        };
812
813        // The kernel returns EOVERFLOW when there are more results than
814        // fit in one batch. We read what we got and loop with the updated
815        // min_treeid.
816        let overflow = match ret {
817            Ok(_) => false,
818            Err(nix::errno::Errno::EOVERFLOW) => true,
819            Err(e) => return Err(e),
820        };
821
822        let count = args.num_items as usize;
823        for i in 0..count {
824            let r = &args.rootref[i];
825            results.push(SubvolRootRef {
826                treeid: r.treeid,
827                dirid: r.dirid,
828            });
829        }
830
831        if !overflow || count == 0 {
832            break;
833        }
834
835        // Advance past the last returned treeid for the next batch.
836        min_treeid = args.rootref[count - 1].treeid + 1;
837    }
838
839    Ok(results)
840}
841
842/// Wait for a specific deleted subvolume to be fully cleaned by the kernel.
843///
844/// Blocks until the background cleaner has finished removing the on-disk
845/// data for the given subvolume ID. Returns `Ok(())` both when the wait
846/// completes and when the subvolume is already gone (`ENOENT`).
847/// Useful after `subvolume_delete` when subsequent operations depend on
848/// the subvolume being fully gone (e.g. qgroup staleness checks).
849///
850/// # Errors
851///
852/// Returns `Err` if the ioctl fails (other than `ENOENT`).
853pub fn subvol_sync_wait_one(fd: BorrowedFd, subvolid: u64) -> nix::Result<()> {
854    let args = btrfs_ioctl_subvol_wait {
855        subvolid,
856        mode: BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE,
857        count: 0,
858    };
859    match unsafe { btrfs_ioc_subvol_sync_wait(fd.as_raw_fd(), &raw const args) }
860    {
861        Ok(_) | Err(nix::errno::Errno::ENOENT) => Ok(()),
862        Err(e) => Err(e),
863    }
864}
865
866/// Wait for all currently queued subvolume deletions to complete.
867///
868/// Blocks until every subvolume that was in the deletion queue at the time
869/// of the call has been fully cleaned. Does not wait for subvolumes
870/// deleted after the call is made.
871///
872/// # Errors
873///
874/// Returns `Err` if the ioctl fails.
875pub fn subvol_sync_wait_all(fd: BorrowedFd) -> nix::Result<()> {
876    let args = btrfs_ioctl_subvol_wait {
877        subvolid: 0,
878        mode: BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED,
879        count: 0,
880    };
881    unsafe { btrfs_ioc_subvol_sync_wait(fd.as_raw_fd(), &raw const args) }?;
882    Ok(())
883}
884
#[cfg(test)]
mod tests {
    use super::*;
    use std::{
        collections::HashMap,
        time::{Duration, UNIX_EPOCH},
    };
    use uuid::Uuid;

    /// Index-aligned inputs for `build_full_path`: the list items, their
    /// path segments, and the map from root id to position in both vectors.
    type Fixture = (Vec<SubvolumeListItem>, Vec<String>, HashMap<u64, usize>);

    /// `(root_id, parent_id, segment)` rows for the chain A -> B -> C, with
    /// A parented directly under the FS tree.
    const CHAIN: [(u64, u64, &str); 3] = [
        (256, FS_TREE_OBJECTID, "A"),
        (257, 256, "B"),
        (258, 257, "C"),
    ];

    /// Build a list item where only the root id and parent id are
    /// meaningful; every other field is zero/nil/empty.
    fn test_item(root_id: u64, parent_id: u64) -> SubvolumeListItem {
        SubvolumeListItem {
            root_id,
            parent_id,
            name: String::new(),
            dir_id: 0,
            generation: 0,
            otransid: 0,
            otime: UNIX_EPOCH,
            flags: SubvolumeFlags::empty(),
            uuid: Uuid::nil(),
            parent_uuid: Uuid::nil(),
            received_uuid: Uuid::nil(),
        }
    }

    /// Expand `(root_id, parent_id, segment)` rows into the three parallel
    /// lookup structures that `build_full_path` takes.
    fn fixture(rows: &[(u64, u64, &str)]) -> Fixture {
        let items = rows
            .iter()
            .map(|&(id, parent, _)| test_item(id, parent))
            .collect();
        let segments = rows
            .iter()
            .map(|&(_, _, segment)| segment.to_string())
            .collect();
        let id_to_idx = rows
            .iter()
            .enumerate()
            .map(|(idx, &(id, _, _))| (id, idx))
            .collect();
        (items, segments, id_to_idx)
    }

    /// Serialise a `btrfs_root_ref` the way these tests feed it to the
    /// parser: dirid (8 LE) + sequence (8 LE) + name_len (2 LE) + name
    /// bytes. `name_len` is taken separately so a test can declare a length
    /// that disagrees with the bytes actually present.
    fn encode_root_ref(
        dirid: u64,
        sequence: u64,
        name_len: u16,
        name: &[u8],
    ) -> Vec<u8> {
        let mut raw = Vec::with_capacity(18 + name.len());
        raw.extend_from_slice(&dirid.to_le_bytes());
        raw.extend_from_slice(&sequence.to_le_bytes());
        raw.extend_from_slice(&name_len.to_le_bytes());
        raw.extend_from_slice(name);
        raw
    }

    #[test]
    fn timespec_zero_returns_epoch() {
        let converted = timespec_to_system_time(0, 0);
        assert_eq!(converted, UNIX_EPOCH);
    }

    #[test]
    fn timespec_zero_sec_with_nonzero_nsec_returns_epoch() {
        // A zero seconds field short-circuits to the epoch; nsec is ignored.
        let converted = timespec_to_system_time(0, 500_000_000);
        assert_eq!(converted, UNIX_EPOCH);
    }

    #[test]
    fn timespec_nonzero_returns_correct_time() {
        let expected = UNIX_EPOCH + Duration::new(1000, 500);
        assert_eq!(timespec_to_system_time(1000, 500), expected);
    }

    #[test]
    fn subvolume_flags_display_readonly() {
        assert_eq!(SubvolumeFlags::RDONLY.to_string(), "readonly");
    }

    #[test]
    fn subvolume_flags_display_empty() {
        assert_eq!(SubvolumeFlags::empty().to_string(), "-");
    }

    #[test]
    fn parse_root_ref_valid() {
        let name = b"mysubvol";
        let buf = encode_root_ref(42, 1, name.len() as u16, name);

        let (dir_id, parsed_name) =
            parse_root_ref(&buf).expect("well-formed root ref must parse");
        assert_eq!(dir_id, 42);
        assert_eq!(parsed_name, "mysubvol");
    }

    #[test]
    fn parse_root_ref_too_short_header() {
        // Only 10 bytes: shorter than the 18-byte btrfs_root_ref header.
        assert!(parse_root_ref(&[0u8; 10]).is_none());
    }

    #[test]
    fn parse_root_ref_truncated_name() {
        // The header advertises a 10-byte name, yet no name bytes follow.
        // Parsing still succeeds and yields an empty name (graceful
        // truncation).
        let buf = encode_root_ref(0, 0, 10, b"");

        let (_, name) = parse_root_ref(&buf)
            .expect("header-only root ref must still parse");
        assert!(name.is_empty());
    }

    #[test]
    fn parse_root_ref_empty_name() {
        let buf = encode_root_ref(100, 0, 0, b"");

        let (dir_id, parsed_name) =
            parse_root_ref(&buf).expect("root ref with empty name must parse");
        assert_eq!(dir_id, 100);
        assert_eq!(parsed_name, "");
    }

    #[test]
    fn build_full_path_single_subvol_parent_fs_tree() {
        // One subvolume (256) sitting directly under FS_TREE (5).
        let (items, segments, id_to_idx) =
            fixture(&[(256, FS_TREE_OBJECTID, "mysub")]);
        let mut cache = HashMap::new();

        let path = build_full_path(
            256,
            FS_TREE_OBJECTID,
            &id_to_idx,
            &segments,
            &items,
            &mut cache,
        );
        assert_eq!(path, "mysub");
    }

    #[test]
    fn build_full_path_nested_chain() {
        // Resolving the deepest member of A -> B -> C joins all segments.
        let (items, segments, id_to_idx) = fixture(&CHAIN);
        let mut cache = HashMap::new();

        let path = build_full_path(
            258,
            FS_TREE_OBJECTID,
            &id_to_idx,
            &segments,
            &items,
            &mut cache,
        );
        assert_eq!(path, "A/B/C");
    }

    #[test]
    fn build_full_path_stops_at_top_id() {
        // Chain A (256) -> B (257) -> C (258) with top_id = 257 (B).
        let (items, segments, id_to_idx) = fixture(&CHAIN);
        let mut cache = HashMap::new();

        // Paths are relative to top_id: C's parent is top_id itself, so C
        // resolves to its own segment rather than "A/B/C".
        let path_c = build_full_path(
            258, 257, &id_to_idx, &segments, &items, &mut cache,
        );
        assert_eq!(path_c, "C");

        // Asking for top_id itself (B) is different: B's parent is 256,
        // which is not top_id, so the walk continues upwards. A's parent is
        // FS_TREE (5), which ends the walk, giving A = "A" and B = "A/B".
        let path_b = build_full_path(
            257, 257, &id_to_idx, &segments, &items, &mut cache,
        );
        assert_eq!(path_b, "A/B");
    }

    #[test]
    fn build_full_path_cycle_detection() {
        // A (256) and B (257) name each other as parent: a mutual cycle.
        let (items, segments, id_to_idx) =
            fixture(&[(256, 257, "A"), (257, 256, "B")]);
        let mut cache = HashMap::new();

        // The only requirement is termination; the exact (truncated) string
        // depends on the cycle-handling heuristic, so it is not asserted.
        let _path = build_full_path(
            256,
            FS_TREE_OBJECTID,
            &id_to_idx,
            &segments,
            &items,
            &mut cache,
        );
    }

    #[test]
    fn build_full_path_cached_ancestor() {
        // With B's path already cached, resolving C should build on it.
        let (items, segments, id_to_idx) = fixture(&CHAIN);
        let mut cache = HashMap::new();
        cache.insert(257, "A/B".to_string());

        let path = build_full_path(
            258,
            FS_TREE_OBJECTID,
            &id_to_idx,
            &segments,
            &items,
            &mut cache,
        );
        assert_eq!(path, "A/B/C");
    }

    #[test]
    fn build_full_path_unknown_parent() {
        // Parent 999 has no entry in id_to_idx.
        let (items, segments, id_to_idx) = fixture(&[(256, 999, "orphan")]);
        let mut cache = HashMap::new();

        let path = build_full_path(
            256,
            FS_TREE_OBJECTID,
            &id_to_idx,
            &segments,
            &items,
            &mut cache,
        );
        assert_eq!(path, "orphan");
    }

    #[test]
    fn build_full_path_parent_id_zero() {
        // parent_id == 0 stands for "no backref found".
        let (items, segments, id_to_idx) = fixture(&[(256, 0, "noparent")]);
        let mut cache = HashMap::new();

        let path = build_full_path(
            256,
            FS_TREE_OBJECTID,
            &id_to_idx,
            &segments,
            &items,
            &mut cache,
        );
        assert_eq!(path, "noparent");
    }

    #[test]
    fn build_full_path_already_cached_target() {
        // A cache hit for the requested id wins over recomputation.
        let (items, segments, id_to_idx) =
            fixture(&[(256, FS_TREE_OBJECTID, "A")]);
        let mut cache = HashMap::new();
        cache.insert(256, "cached/path".to_string());

        let path = build_full_path(
            256,
            FS_TREE_OBJECTID,
            &id_to_idx,
            &segments,
            &items,
            &mut cache,
        );
        assert_eq!(path, "cached/path");
    }

    #[test]
    fn build_full_path_root_id_not_in_items() {
        // The requested root id (999) is absent from id_to_idx altogether.
        let (items, segments, id_to_idx) =
            fixture(&[(256, FS_TREE_OBJECTID, "A")]);
        let mut cache = HashMap::new();

        let path = build_full_path(
            999,
            FS_TREE_OBJECTID,
            &id_to_idx,
            &segments,
            &items,
            &mut cache,
        );
        assert_eq!(path, "");
    }
}