Skip to main content

btrfs_uapi/
subvolume.rs

1//! # Subvolume and snapshot management: creating, deleting, and querying subvolumes
2//!
3//! Subvolumes are independently snapshotable subtrees within a btrfs filesystem.
4//! Snapshots are subvolumes created as copy-on-write clones of an existing
5//! subvolume.  This module covers the full lifecycle: creating and deleting
6//! subvolumes and snapshots, reading subvolume metadata and flags, listing all
7//! subvolumes in a filesystem, and getting or setting the default subvolume
8//! that is mounted when no subvolume is explicitly requested.
9
10use crate::{
11    field_size,
12    raw::{
13        BTRFS_DIR_ITEM_KEY, BTRFS_FIRST_FREE_OBJECTID, BTRFS_FS_TREE_OBJECTID,
14        BTRFS_LAST_FREE_OBJECTID, BTRFS_ROOT_BACKREF_KEY, BTRFS_ROOT_ITEM_KEY,
15        BTRFS_ROOT_TREE_DIR_OBJECTID, BTRFS_ROOT_TREE_OBJECTID,
16        BTRFS_SUBVOL_QGROUP_INHERIT, BTRFS_SUBVOL_RDONLY,
17        BTRFS_SUBVOL_SPEC_BY_ID, BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE,
18        BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED, btrfs_ioc_default_subvol,
19        btrfs_ioc_get_subvol_info, btrfs_ioc_ino_lookup,
20        btrfs_ioc_snap_create_v2, btrfs_ioc_snap_destroy_v2,
21        btrfs_ioc_subvol_create_v2, btrfs_ioc_subvol_getflags,
22        btrfs_ioc_subvol_setflags, btrfs_ioc_subvol_sync_wait,
23        btrfs_ioctl_get_subvol_info_args, btrfs_ioctl_ino_lookup_args,
24        btrfs_ioctl_subvol_wait, btrfs_ioctl_vol_args_v2, btrfs_qgroup_inherit,
25        btrfs_root_item, btrfs_timespec,
26    },
27    tree_search::{SearchKey, tree_search},
28};
29use bitflags::bitflags;
30use nix::libc::c_char;
31use std::{
32    collections::HashMap,
33    ffi::CStr,
34    mem,
35    os::{fd::AsRawFd, unix::io::BorrowedFd},
36    time::{Duration, SystemTime, UNIX_EPOCH},
37};
38use uuid::Uuid;
39
40/// The top-level subvolume (FS tree); objectid 5, always present.
41///
42/// Returned by [`subvolume_default_get`] when no explicit default has been set.
43pub const FS_TREE_OBJECTID: u64 = BTRFS_FS_TREE_OBJECTID as u64;
44
45bitflags! {
46    /// Flags on a btrfs subvolume (the `flags` field of the root item /
47    /// `BTRFS_IOC_SUBVOL_{GET,SET}FLAGS`).
48    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
49    pub struct SubvolumeFlags: u64 {
50        /// Subvolume is read-only.
51        const RDONLY = BTRFS_SUBVOL_RDONLY as u64;
52    }
53}
54
55impl std::fmt::Display for SubvolumeFlags {
56    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
57        if self.contains(SubvolumeFlags::RDONLY) {
58            write!(f, "readonly")
59        } else {
60            write!(f, "-")
61        }
62    }
63}
64
65/// Subvolume metadata returned by `BTRFS_IOC_GET_SUBVOL_INFO`.
66#[derive(Debug, Clone)]
67pub struct SubvolumeInfo {
68    /// Root ID (subvolume ID) of this subvolume.
69    pub id: u64,
70    /// Name of this subvolume within its parent directory.
71    pub name: String,
72    /// Root ID of the parent subvolume.
73    pub parent_id: u64,
74    /// Inode number of the directory within the parent that holds this subvolume.
75    pub dir_id: u64,
76    /// Current generation of the subvolume.
77    pub generation: u64,
78    /// Subvolume flags (e.g. read-only).
79    pub flags: SubvolumeFlags,
80    /// UUID of this subvolume.
81    pub uuid: Uuid,
82    /// UUID of the parent subvolume (non-nil for snapshots).
83    pub parent_uuid: Uuid,
84    /// UUID of the received subvolume (non-nil for received snapshots).
85    pub received_uuid: Uuid,
86    /// Transaction ID when the subvolume was last changed.
87    pub ctransid: u64,
88    /// Transaction ID when the subvolume was created.
89    pub otransid: u64,
90    /// Transaction ID when a send was last performed.
91    pub stransid: u64,
92    /// Transaction ID when this subvolume was last received.
93    pub rtransid: u64,
94    /// Time of last change.
95    pub ctime: SystemTime,
96    /// Creation time.
97    pub otime: SystemTime,
98    /// Time of last send.
99    pub stime: SystemTime,
100    /// Time of last receive.
101    pub rtime: SystemTime,
102}
103
104/// A single subvolume entry returned by [`subvolume_list`].
105#[derive(Debug, Clone)]
106pub struct SubvolumeListItem {
107    /// Root ID (subvolume ID).
108    pub root_id: u64,
109    /// Root ID of the parent subvolume (`0` if unknown / not found).
110    pub parent_id: u64,
111    /// Inode of the directory in the parent that contains this subvolume.
112    pub dir_id: u64,
113    /// Current generation.
114    pub generation: u64,
115    /// Subvolume flags.
116    pub flags: SubvolumeFlags,
117    /// UUID of this subvolume.
118    pub uuid: Uuid,
119    /// UUID of the parent subvolume.
120    pub parent_uuid: Uuid,
121    /// UUID of the received subvolume.
122    pub received_uuid: Uuid,
123    /// Transaction ID when created.
124    pub otransid: u64,
125    /// Creation time.
126    pub otime: SystemTime,
127    /// Subvolume name (the leaf name within its parent).
128    ///
129    /// Full path resolution relative to the filesystem root requires
130    /// `BTRFS_IOC_INO_LOOKUP` and is not yet implemented; this field contains
131    /// only the leaf name as stored in the root backref item.
132    pub name: String,
133}
134
135/// Write `name` into the `name` union member of a zeroed
136/// `btrfs_ioctl_vol_args_v2`, returning `ENAMETOOLONG` if it does not fit.
137fn set_v2_name(
138    args: &mut btrfs_ioctl_vol_args_v2,
139    name: &CStr,
140) -> nix::Result<()> {
141    let bytes = name.to_bytes(); // excludes nul terminator
142    // SAFETY: name is the active union member; the struct is already zeroed so
143    // the implicit nul terminator is already present.
144    let name_buf: &mut [c_char] = unsafe { &mut args.__bindgen_anon_2.name };
145    if bytes.len() >= name_buf.len() {
146        return Err(nix::errno::Errno::ENAMETOOLONG);
147    }
148    for (i, &b) in bytes.iter().enumerate() {
149        name_buf[i] = b as c_char;
150    }
151    Ok(())
152}
153
154/// Build a `btrfs_qgroup_inherit` buffer for the given qgroup IDs.
155///
156/// The returned `Vec<u64>` is sized to hold the base struct plus the trailing
157/// `qgroups[]` array, with 8-byte alignment guaranteed by the `u64` element
158/// type.
159fn build_qgroup_inherit(qgroups: &[u64]) -> Vec<u64> {
160    let base_size = mem::size_of::<btrfs_qgroup_inherit>();
161    let total_size = base_size + std::mem::size_of_val(qgroups);
162    let num_u64 = total_size.div_ceil(8);
163    let mut buf = vec![0u64; num_u64];
164
165    // SAFETY: buf is large enough and zeroed; we write through a properly
166    // aligned pointer (btrfs_qgroup_inherit has 8-byte alignment).
167    let inherit =
168        unsafe { &mut *(buf.as_mut_ptr() as *mut btrfs_qgroup_inherit) };
169    inherit.num_qgroups = qgroups.len() as u64;
170
171    // Write the qgroup IDs into the flexible array member.
172    if !qgroups.is_empty() {
173        let array = unsafe { inherit.qgroups.as_mut_slice(qgroups.len()) };
174        array.copy_from_slice(qgroups);
175    }
176
177    buf
178}
179
180/// Set the `BTRFS_SUBVOL_QGROUP_INHERIT` fields on a `vol_args_v2` struct.
181///
182/// `buf` must be the buffer returned by `build_qgroup_inherit`.
183fn set_qgroup_inherit(
184    args: &mut btrfs_ioctl_vol_args_v2,
185    buf: &[u64],
186    num_qgroups: usize,
187) {
188    args.flags |= BTRFS_SUBVOL_QGROUP_INHERIT as u64;
189    let base_size = mem::size_of::<btrfs_qgroup_inherit>();
190    let total_size = base_size + num_qgroups * mem::size_of::<u64>();
191    args.__bindgen_anon_1.__bindgen_anon_1.size = total_size as u64;
192    args.__bindgen_anon_1.__bindgen_anon_1.qgroup_inherit =
193        buf.as_ptr() as *mut btrfs_qgroup_inherit;
194}
195
196/// Create a new subvolume named `name` inside the directory referred to by
197/// `parent_fd`.
198///
199/// `name` must be a plain leaf name (no slashes).  The caller is responsible
200/// for opening the correct parent directory.  If `qgroups` is non-empty, the
201/// new subvolume is added to those qgroups.  Requires `CAP_SYS_ADMIN`.
202///
203/// Errors: ENAMETOOLONG if `name` does not fit in the 4040-byte kernel
204/// buffer.  EEXIST if a subvolume or directory with that name already exists.
205/// EPERM without `CAP_SYS_ADMIN`.
206pub fn subvolume_create(
207    parent_fd: BorrowedFd,
208    name: &CStr,
209    qgroups: &[u64],
210) -> nix::Result<()> {
211    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
212    set_v2_name(&mut args, name)?;
213
214    let inherit_buf;
215    if !qgroups.is_empty() {
216        inherit_buf = build_qgroup_inherit(qgroups);
217        set_qgroup_inherit(&mut args, &inherit_buf, qgroups.len());
218    }
219
220    unsafe { btrfs_ioc_subvol_create_v2(parent_fd.as_raw_fd(), &args) }?;
221    Ok(())
222}
223
224/// Delete the subvolume or snapshot named `name` from the directory referred
225/// to by `parent_fd`.
226///
227/// `name` must be a plain leaf name (no slashes).  Requires `CAP_SYS_ADMIN`.
228///
229/// Deletion is asynchronous: the ioctl removes the directory entry
230/// immediately, but the kernel cleaner thread reclaims the on-disk data
231/// in the background. Until the next transaction commit the deletion is
232/// not visible to other operations (e.g. `subvolume_list` still shows
233/// the subvolume). Call `sync` to force a commit, or pass
234/// `-c`/`--commit-after` at the CLI level. To wait for the cleaner to
235/// finish, use [`subvol_sync_wait_one`].
236pub fn subvolume_delete(parent_fd: BorrowedFd, name: &CStr) -> nix::Result<()> {
237    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
238    set_v2_name(&mut args, name)?;
239    unsafe { btrfs_ioc_snap_destroy_v2(parent_fd.as_raw_fd(), &args) }?;
240    Ok(())
241}
242
243/// Delete a subvolume by its numeric subvolume ID.
244///
245/// `fd` must be an open file descriptor on the filesystem (typically the mount
246/// point).  Unlike `subvolume_delete`, this does not require knowing the
247/// subvolume's path.  Requires `CAP_SYS_ADMIN`.
248///
249/// See [`subvolume_delete`] for details on commit visibility and async
250/// cleanup.
251pub fn subvolume_delete_by_id(
252    fd: BorrowedFd,
253    subvolid: u64,
254) -> nix::Result<()> {
255    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
256    args.flags = BTRFS_SUBVOL_SPEC_BY_ID as u64;
257    args.__bindgen_anon_2.subvolid = subvolid;
258    unsafe { btrfs_ioc_snap_destroy_v2(fd.as_raw_fd(), &args) }?;
259    Ok(())
260}
261
262/// Create a snapshot of the subvolume referred to by `source_fd`, placing it
263/// as `name` inside the directory referred to by `parent_fd`.
264///
265/// If `readonly` is `true` the new snapshot is created read-only.  If
266/// `qgroups` is non-empty, the new snapshot is added to those qgroups.
267/// Requires `CAP_SYS_ADMIN`.
268///
269/// Errors: ENAMETOOLONG if `name` does not fit in the 4040-byte kernel
270/// buffer.  EEXIST if a subvolume or directory with that name already exists.
271/// EROFS if `parent_fd` refers to a read-only subvolume.  EPERM without
272/// `CAP_SYS_ADMIN`.
273pub fn snapshot_create(
274    parent_fd: BorrowedFd,
275    source_fd: BorrowedFd,
276    name: &CStr,
277    readonly: bool,
278    qgroups: &[u64],
279) -> nix::Result<()> {
280    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
281    // The `fd` field carries the source subvolume file descriptor.
282    args.fd = source_fd.as_raw_fd() as i64;
283    if readonly {
284        args.flags = BTRFS_SUBVOL_RDONLY as u64;
285    }
286    set_v2_name(&mut args, name)?;
287
288    let inherit_buf;
289    if !qgroups.is_empty() {
290        inherit_buf = build_qgroup_inherit(qgroups);
291        set_qgroup_inherit(&mut args, &inherit_buf, qgroups.len());
292    }
293
294    unsafe { btrfs_ioc_snap_create_v2(parent_fd.as_raw_fd(), &args) }?;
295    Ok(())
296}
297
298/// Query detailed information about the subvolume that `fd` belongs to.
299///
300/// `fd` can be any file or directory within the target subvolume.
301/// Does not require elevated privileges.
302pub fn subvolume_info(fd: BorrowedFd) -> nix::Result<SubvolumeInfo> {
303    subvolume_info_by_id(fd, 0)
304}
305
306/// Query detailed information about a subvolume by its numeric root ID.
307///
308/// `fd` can be any open file descriptor on the filesystem.  If `rootid` is 0,
309/// the subvolume that `fd` belongs to is queried (equivalent to
310/// `subvolume_info`).  Does not require elevated privileges.
311///
312/// Errors: ENOENT if no subvolume with that `rootid` exists (or has been
313/// deleted but not yet cleaned).
314pub fn subvolume_info_by_id(
315    fd: BorrowedFd,
316    rootid: u64,
317) -> nix::Result<SubvolumeInfo> {
318    let mut raw: btrfs_ioctl_get_subvol_info_args = unsafe { mem::zeroed() };
319    raw.treeid = rootid;
320    unsafe { btrfs_ioc_get_subvol_info(fd.as_raw_fd(), &mut raw) }?;
321
322    let name = unsafe { CStr::from_ptr(raw.name.as_ptr()) }
323        .to_string_lossy()
324        .into_owned();
325
326    Ok(SubvolumeInfo {
327        id: raw.treeid,
328        name,
329        parent_id: raw.parent_id,
330        dir_id: raw.dirid,
331        generation: raw.generation,
332        flags: SubvolumeFlags::from_bits_truncate(raw.flags),
333        uuid: Uuid::from_bytes(raw.uuid),
334        parent_uuid: Uuid::from_bytes(raw.parent_uuid),
335        received_uuid: Uuid::from_bytes(raw.received_uuid),
336        ctransid: raw.ctransid,
337        otransid: raw.otransid,
338        stransid: raw.stransid,
339        rtransid: raw.rtransid,
340        ctime: ioctl_timespec_to_system_time(raw.ctime.sec, raw.ctime.nsec),
341        otime: ioctl_timespec_to_system_time(raw.otime.sec, raw.otime.nsec),
342        stime: ioctl_timespec_to_system_time(raw.stime.sec, raw.stime.nsec),
343        rtime: ioctl_timespec_to_system_time(raw.rtime.sec, raw.rtime.nsec),
344    })
345}
346
347/// Read the flags of the subvolume that `fd` belongs to.
348pub fn subvolume_flags_get(fd: BorrowedFd) -> nix::Result<SubvolumeFlags> {
349    let mut flags: u64 = 0;
350    unsafe { btrfs_ioc_subvol_getflags(fd.as_raw_fd(), &mut flags) }?;
351    Ok(SubvolumeFlags::from_bits_truncate(flags))
352}
353
354/// Set the flags of the subvolume that `fd` belongs to.
355///
356/// Requires `CAP_SYS_ADMIN` to make a subvolume read-only; any user may
357/// clear the read-only flag from a subvolume they own.
358pub fn subvolume_flags_set(
359    fd: BorrowedFd,
360    flags: SubvolumeFlags,
361) -> nix::Result<()> {
362    let raw: u64 = flags.bits();
363    unsafe { btrfs_ioc_subvol_setflags(fd.as_raw_fd(), &raw) }?;
364    Ok(())
365}
366
367/// Query the ID of the default subvolume of the filesystem referred to by
368/// `fd`.
369///
370/// Searches the root tree for the `BTRFS_DIR_ITEM_KEY` entry at objectid
371/// `BTRFS_ROOT_TREE_DIR_OBJECTID` that stores the default subvolume ID.
372/// Returns [`FS_TREE_OBJECTID`] if no default has been set.
373///
374/// Requires `CAP_SYS_ADMIN`.
375pub fn subvolume_default_get(fd: BorrowedFd) -> nix::Result<u64> {
376    let mut default_id: Option<u64> = None;
377
378    tree_search(
379        fd,
380        SearchKey::for_objectid_range(
381            BTRFS_ROOT_TREE_OBJECTID as u64,
382            BTRFS_DIR_ITEM_KEY,
383            BTRFS_ROOT_TREE_DIR_OBJECTID as u64,
384            BTRFS_ROOT_TREE_DIR_OBJECTID as u64,
385        ),
386        |_hdr, data| {
387            use crate::raw::btrfs_dir_item;
388            use std::mem::{offset_of, size_of};
389
390            let header_size = size_of::<btrfs_dir_item>();
391            if data.len() < header_size {
392                return Ok(());
393            }
394            let name_off = offset_of!(btrfs_dir_item, name_len);
395            let name_len =
396                u16::from_le_bytes([data[name_off], data[name_off + 1]])
397                    as usize;
398            if data.len() < header_size + name_len {
399                return Ok(());
400            }
401            let item_name = &data[header_size..header_size + name_len];
402            if item_name == b"default" {
403                let loc_off = offset_of!(btrfs_dir_item, location);
404                let target_id = u64::from_le_bytes(
405                    data[loc_off..loc_off + 8].try_into().unwrap(),
406                );
407                default_id = Some(target_id);
408            }
409            Ok(())
410        },
411    )?;
412
413    Ok(default_id.unwrap_or(BTRFS_FS_TREE_OBJECTID as u64))
414}
415
416/// Set the default subvolume of the filesystem referred to by `fd` to
417/// `subvolid`.
418///
419/// Pass [`FS_TREE_OBJECTID`] to restore the default.  Requires `CAP_SYS_ADMIN`.
420pub fn subvolume_default_set(fd: BorrowedFd, subvolid: u64) -> nix::Result<()> {
421    unsafe { btrfs_ioc_default_subvol(fd.as_raw_fd(), &subvolid) }?;
422    Ok(())
423}
424
425/// List all user subvolumes and snapshots in the filesystem referred to by
426/// `fd` by walking the root tree.
427///
428/// Two tree-search passes are made:
429/// 1. `ROOT_ITEM_KEY` — reads each subvolume's metadata (generation, flags,
430///    UUIDs, creation time).
431/// 2. `BTRFS_ROOT_BACKREF_KEY` — reads each subvolume's parent ID and leaf name.
432///
433/// Subvolumes for which no backref is found are still included; their
434/// `parent_id`, `dir_id`, and `name` will be zeroed / empty.
435///
436/// Requires `CAP_SYS_ADMIN` for the tree search. Without it the kernel
437/// returns EPERM; the caller should degrade gracefully (e.g. show only the
438/// leaf name without full path resolution).
439pub fn subvolume_list(fd: BorrowedFd) -> nix::Result<Vec<SubvolumeListItem>> {
440    let mut items: Vec<SubvolumeListItem> = Vec::new();
441
442    tree_search(
443        fd,
444        SearchKey::for_objectid_range(
445            BTRFS_ROOT_TREE_OBJECTID as u64,
446            BTRFS_ROOT_ITEM_KEY,
447            BTRFS_FIRST_FREE_OBJECTID as u64,
448            BTRFS_LAST_FREE_OBJECTID as u64,
449        ),
450        |hdr, data| {
451            if let Some(item) = parse_root_item(hdr.objectid, data) {
452                items.push(item);
453            }
454            Ok(())
455        },
456    )?;
457
458    tree_search(
459        fd,
460        SearchKey::for_objectid_range(
461            BTRFS_ROOT_TREE_OBJECTID as u64,
462            BTRFS_ROOT_BACKREF_KEY,
463            BTRFS_FIRST_FREE_OBJECTID as u64,
464            BTRFS_LAST_FREE_OBJECTID as u64,
465        ),
466        |hdr, data| {
467            // ROOT_BACKREF_KEY: objectid = subvol root_id, offset = parent root_id
468            let root_id = hdr.objectid;
469            let parent_id = hdr.offset;
470
471            if let Some(item) = items.iter_mut().find(|i| i.root_id == root_id)
472            {
473                // Only take the first ROOT_BACKREF for each subvolume.  A
474                // subvolume can have multiple ROOT_BACKREF entries when it is
475                // referenced from more than one parent (e.g. the subvolume
476                // also appears as a snapshot inside another subvolume).
477                // Items are returned in offset-ascending order, so the first
478                // entry has the smallest parent_id — which is the canonical
479                // location btrfs-progs uses for "top level" and path.
480                if item.parent_id == 0 {
481                    item.parent_id = parent_id;
482                    if let Some((dir_id, name)) = parse_root_ref(data) {
483                        item.dir_id = dir_id;
484                        item.name = name;
485                    }
486                }
487            }
488            Ok(())
489        },
490    )?;
491
492    // Determine which subvolume the fd is open on.  Paths are expressed
493    // relative to this subvolume, matching btrfs-progs behaviour.
494    let top_id =
495        crate::inode::lookup_path_rootid(fd).unwrap_or(FS_TREE_OBJECTID);
496
497    resolve_full_paths(fd, &mut items, top_id)?;
498
499    Ok(items)
500}
501
502/// Call `BTRFS_IOC_INO_LOOKUP` for `dir_id` within `parent_tree` and return
503/// the path from that tree's root to the directory.
504///
505/// The kernel returns the path with a trailing `/` when the directory is not
506/// the tree root, and an empty string when `dir_id` is the tree root itself.
507/// This prefix can be concatenated directly with the subvolume's leaf name to
508/// form the full segment within the parent.
509fn ino_lookup_dir_path(
510    fd: BorrowedFd,
511    parent_tree: u64,
512    dir_id: u64,
513) -> nix::Result<String> {
514    let mut args = btrfs_ioctl_ino_lookup_args {
515        treeid: parent_tree,
516        objectid: dir_id,
517        ..unsafe { mem::zeroed() }
518    };
519    // SAFETY: args is a valid, zeroed btrfs_ioctl_ino_lookup_args; the ioctl
520    // fills in args.name with a null-terminated path string.
521    unsafe { btrfs_ioc_ino_lookup(fd.as_raw_fd(), &mut args) }?;
522
523    // args.name is [c_char; 4080]; find the null terminator and decode.
524    let name_ptr: *const c_char = args.name.as_ptr();
525    // SAFETY: the ioctl guarantees null termination within the 4080-byte buffer.
526    let cstr = unsafe { CStr::from_ptr(name_ptr) };
527    Ok(cstr.to_string_lossy().into_owned())
528}
529
530/// Resolve the `name` field of every item in `items` from a bare leaf name to
531/// the full path relative to the filesystem root.
532///
533/// For each subvolume we already have `parent_id`, `dir_id`, and the leaf name
534/// from the ROOT_BACKREF pass.  A single `BTRFS_IOC_INO_LOOKUP` call per item
535/// gives the path from the parent tree's root down to the directory that
536/// contains the subvolume (the "dir prefix").  Concatenating that prefix with
537/// the leaf name yields the subvolume's segment within its parent.  Walking up
538/// the parent chain (using the in-memory items map) and joining segments with
539/// `/` gives the final full path.
540fn resolve_full_paths(
541    fd: BorrowedFd,
542    items: &mut [SubvolumeListItem],
543    top_id: u64,
544) -> nix::Result<()> {
545    // Map root_id → index for O(1) parent lookups.
546    let id_to_idx: HashMap<u64, usize> = items
547        .iter()
548        .enumerate()
549        .map(|(i, item)| (item.root_id, i))
550        .collect();
551
552    // Compute the "segment" for each item: the path of this subvolume within
553    // its immediate parent (dir prefix from INO_LOOKUP + leaf name).
554    // If INO_LOOKUP is not available (e.g. missing CAP_SYS_ADMIN), fall back
555    // to the bare leaf name so the list still works.
556    let segments: Vec<String> = items
557        .iter()
558        .map(|item| {
559            if item.parent_id == 0 || item.name.is_empty() {
560                return item.name.clone();
561            }
562            match ino_lookup_dir_path(fd, item.parent_id, item.dir_id) {
563                Ok(prefix) => format!("{}{}", prefix, item.name),
564                Err(_) => item.name.clone(),
565            }
566        })
567        .collect();
568
569    // Walk each item's parent chain, joining segments, and cache results so
570    // every ancestor is resolved at most once.
571    let mut full_paths: HashMap<u64, String> = HashMap::new();
572    let root_ids: Vec<u64> = items.iter().map(|i| i.root_id).collect();
573    for root_id in root_ids {
574        build_full_path(
575            root_id,
576            top_id,
577            &id_to_idx,
578            &segments,
579            items,
580            &mut full_paths,
581        );
582    }
583
584    for item in items.iter_mut() {
585        if let Some(path) = full_paths.remove(&item.root_id) {
586            item.name = path;
587        }
588    }
589
590    Ok(())
591}
592
593/// Compute the full path for `root_id`, memoizing into `cache`.
594///
595/// Walks the ancestor chain iteratively to avoid stack overflow on deep
596/// subvolume trees.  Collects segments from the target up to the FS tree
597/// root, then joins them in reverse order.
598///
599/// Cycle detection is included: ROOT_BACKREF entries can form mutual parent
600/// relationships (e.g. a snapshot stored inside the subvolume it was taken
601/// from), which would otherwise loop forever.
602fn build_full_path(
603    root_id: u64,
604    top_id: u64,
605    id_to_idx: &HashMap<u64, usize>,
606    segments: &[String],
607    items: &[SubvolumeListItem],
608    cache: &mut HashMap<u64, String>,
609) -> String {
610    // Collect the chain of root_ids from `root_id` up to the top subvolume
611    // (the one the fd is open on) or the FS tree root, stopping early if we
612    // hit an already-cached ancestor or a cycle.
613    let mut chain: Vec<u64> = Vec::new();
614    let mut visited: HashMap<u64, usize> = HashMap::new();
615    let mut cur = root_id;
616    loop {
617        if cache.contains_key(&cur) {
618            break;
619        }
620        if visited.contains_key(&cur) {
621            // Cycle detected: truncate the chain back to where the cycle
622            // starts so we don't join nonsensical repeated segments.
623            let cycle_start = visited[&cur];
624            chain.truncate(cycle_start);
625            break;
626        }
627        let Some(&idx) = id_to_idx.get(&cur) else {
628            break;
629        };
630        visited.insert(cur, chain.len());
631        chain.push(cur);
632        let parent = items[idx].parent_id;
633        if parent == 0
634            || parent == FS_TREE_OBJECTID
635            || parent == top_id
636            || !id_to_idx.contains_key(&parent)
637        {
638            break;
639        }
640        cur = parent;
641    }
642
643    // Resolve each node in the chain from root toward the target, building
644    // on any already-cached prefix we stopped at.
645    for &id in chain.iter().rev() {
646        let Some(&idx) = id_to_idx.get(&id) else {
647            cache.insert(id, String::new());
648            continue;
649        };
650        let segment = &segments[idx];
651        let parent_id = items[idx].parent_id;
652
653        let full_path = if parent_id == 0
654            || parent_id == FS_TREE_OBJECTID
655            || parent_id == top_id
656            || !id_to_idx.contains_key(&parent_id)
657        {
658            segment.clone()
659        } else if let Some(parent_path) = cache.get(&parent_id) {
660            if parent_path.is_empty() {
661                segment.clone()
662            } else {
663                format!("{}/{}", parent_path, segment)
664            }
665        } else {
666            segment.clone()
667        };
668
669        cache.insert(id, full_path);
670    }
671
672    cache.get(&root_id).cloned().unwrap_or_default()
673}
674
675/// `btrfs_root_item` field offsets (packed, LE).
676fn parse_root_item(root_id: u64, data: &[u8]) -> Option<SubvolumeListItem> {
677    use std::mem::offset_of;
678
679    // Items shorter than generation_v2 are "legacy" and do not carry
680    // UUID / otime / otransid fields.
681    let legacy_boundary = offset_of!(btrfs_root_item, generation_v2);
682    if data.len() < legacy_boundary {
683        return None;
684    }
685
686    let generation = rle64(data, offset_of!(btrfs_root_item, generation));
687    let flags_raw = rle64(data, offset_of!(btrfs_root_item, flags));
688    let flags = SubvolumeFlags::from_bits_truncate(flags_raw);
689
690    // Extended fields exist only in non-legacy items.
691    let otime_nsec =
692        offset_of!(btrfs_root_item, otime) + offset_of!(btrfs_timespec, nsec);
693    let (uuid, parent_uuid, received_uuid, otransid, otime) =
694        if data.len() >= otime_nsec + field_size!(btrfs_timespec, nsec) {
695            let off_uuid = offset_of!(btrfs_root_item, uuid);
696            let off_parent = offset_of!(btrfs_root_item, parent_uuid);
697            let off_received = offset_of!(btrfs_root_item, received_uuid);
698            let uuid_size = field_size!(btrfs_root_item, uuid);
699            let uuid = Uuid::from_bytes(
700                data[off_uuid..off_uuid + uuid_size].try_into().unwrap(),
701            );
702            let parent_uuid = Uuid::from_bytes(
703                data[off_parent..off_parent + uuid_size].try_into().unwrap(),
704            );
705            let received_uuid = Uuid::from_bytes(
706                data[off_received..off_received + uuid_size]
707                    .try_into()
708                    .unwrap(),
709            );
710            let otransid = rle64(data, offset_of!(btrfs_root_item, otransid));
711            let otime_sec = offset_of!(btrfs_root_item, otime);
712            let otime = timespec_to_system_time(
713                rle64(data, otime_sec),
714                rle32(data, otime_nsec),
715            );
716            (uuid, parent_uuid, received_uuid, otransid, otime)
717        } else {
718            (Uuid::nil(), Uuid::nil(), Uuid::nil(), 0, UNIX_EPOCH)
719        };
720
721    Some(SubvolumeListItem {
722        root_id,
723        parent_id: 0,
724        dir_id: 0,
725        generation,
726        flags,
727        uuid,
728        parent_uuid,
729        received_uuid,
730        otransid,
731        otime,
732        name: String::new(),
733    })
734}
735
736/// Parse a `btrfs_root_ref` payload (packed, LE). The name immediately
737/// follows the fixed-size header.
738fn parse_root_ref(data: &[u8]) -> Option<(u64, String)> {
739    use crate::raw::btrfs_root_ref;
740    use std::mem::{offset_of, size_of};
741
742    let header_size = size_of::<btrfs_root_ref>();
743    if data.len() < header_size {
744        return None;
745    }
746    let dir_id = rle64(data, offset_of!(btrfs_root_ref, dirid));
747    let name_off = offset_of!(btrfs_root_ref, name_len);
748    let name_len =
749        u16::from_le_bytes([data[name_off], data[name_off + 1]]) as usize;
750    if data.len() < header_size + name_len {
751        return None;
752    }
753    let name =
754        String::from_utf8_lossy(&data[header_size..header_size + name_len])
755            .into_owned();
756    Some((dir_id, name))
757}
758
759#[inline]
760fn rle64(buf: &[u8], off: usize) -> u64 {
761    u64::from_le_bytes(buf[off..off + 8].try_into().unwrap())
762}
763
764#[inline]
765fn rle32(buf: &[u8], off: usize) -> u32 {
766    u32::from_le_bytes(buf[off..off + 4].try_into().unwrap())
767}
768
769/// Convert an on-disk `btrfs_timespec` (LE sec + LE nsec, packed) to
770/// [`SystemTime`].  Returns [`UNIX_EPOCH`] if sec == 0.
771fn timespec_to_system_time(sec: u64, nsec: u32) -> SystemTime {
772    if sec == 0 {
773        return UNIX_EPOCH;
774    }
775    UNIX_EPOCH + Duration::new(sec, nsec)
776}
777
778/// Convert a `btrfs_ioctl_timespec` (host byte order, with padding) to
779/// [`SystemTime`].  Returns [`UNIX_EPOCH`] if sec == 0.
780fn ioctl_timespec_to_system_time(sec: u64, nsec: u32) -> SystemTime {
781    if sec == 0 {
782        return UNIX_EPOCH;
783    }
784    UNIX_EPOCH + Duration::new(sec, nsec)
785}
786
787/// A child subvolume reference returned by [`subvol_rootrefs`].
788#[derive(Debug, Clone, Copy, PartialEq, Eq)]
789pub struct SubvolRootRef {
790    /// Root ID (tree ID) of the child subvolume.
791    pub treeid: u64,
792    /// Directory inode ID where the child is attached in the parent.
793    pub dirid: u64,
794}
795
796/// List the child subvolumes directly under the subvolume opened by `fd`.
797///
798/// Returns all subvolumes whose root is referenced from the given
799/// subvolume. The kernel returns results in batches of up to 255; this
800/// function handles continuation automatically by advancing `min_treeid`.
801///
802/// Does not require `CAP_SYS_ADMIN`.
803///
804/// Errors: ENOTTY on kernels older than 4.18.
805pub fn subvol_rootrefs(fd: BorrowedFd) -> nix::Result<Vec<SubvolRootRef>> {
806    use crate::raw::{
807        btrfs_ioc_get_subvol_rootref, btrfs_ioctl_get_subvol_rootref_args,
808    };
809
810    let mut results = Vec::new();
811    let mut min_treeid: u64 = 0;
812
813    loop {
814        let mut args: btrfs_ioctl_get_subvol_rootref_args =
815            unsafe { std::mem::zeroed() };
816        args.min_treeid = min_treeid;
817
818        let ret =
819            unsafe { btrfs_ioc_get_subvol_rootref(fd.as_raw_fd(), &mut args) };
820
821        // The kernel returns EOVERFLOW when there are more results than
822        // fit in one batch. We read what we got and loop with the updated
823        // min_treeid.
824        let overflow = match ret {
825            Ok(_) => false,
826            Err(nix::errno::Errno::EOVERFLOW) => true,
827            Err(e) => return Err(e),
828        };
829
830        let count = args.num_items as usize;
831        for i in 0..count {
832            let r = &args.rootref[i];
833            results.push(SubvolRootRef {
834                treeid: r.treeid,
835                dirid: r.dirid,
836            });
837        }
838
839        if !overflow || count == 0 {
840            break;
841        }
842
843        // Advance past the last returned treeid for the next batch.
844        min_treeid = args.rootref[count - 1].treeid + 1;
845    }
846
847    Ok(results)
848}
849
850/// Wait for a specific deleted subvolume to be fully cleaned by the kernel.
851///
852/// Blocks until the background cleaner has finished removing the on-disk
853/// data for the given subvolume ID. Returns `Ok(())` both when the wait
854/// completes and when the subvolume is already gone (`ENOENT`).
855/// Useful after `subvolume_delete` when subsequent operations depend on
856/// the subvolume being fully gone (e.g. qgroup staleness checks).
857pub fn subvol_sync_wait_one(fd: BorrowedFd, subvolid: u64) -> nix::Result<()> {
858    let args = btrfs_ioctl_subvol_wait {
859        subvolid,
860        mode: BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE,
861        count: 0,
862    };
863    match unsafe { btrfs_ioc_subvol_sync_wait(fd.as_raw_fd(), &args) } {
864        Ok(_) | Err(nix::errno::Errno::ENOENT) => Ok(()),
865        Err(e) => Err(e),
866    }
867}
868
869/// Wait for all currently queued subvolume deletions to complete.
870///
871/// Blocks until every subvolume that was in the deletion queue at the time
872/// of the call has been fully cleaned. Does not wait for subvolumes
873/// deleted after the call is made.
874pub fn subvol_sync_wait_all(fd: BorrowedFd) -> nix::Result<()> {
875    let args = btrfs_ioctl_subvol_wait {
876        subvolid: 0,
877        mode: BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED,
878        count: 0,
879    };
880    unsafe { btrfs_ioc_subvol_sync_wait(fd.as_raw_fd(), &args) }?;
881    Ok(())
882}
883
884#[cfg(test)]
885mod tests {
886    use super::*;
887    use std::{
888        collections::HashMap,
889        time::{Duration, UNIX_EPOCH},
890    };
891    use uuid::Uuid;
892
893    fn test_item(root_id: u64, parent_id: u64) -> SubvolumeListItem {
894        SubvolumeListItem {
895            root_id,
896            parent_id,
897            dir_id: 0,
898            generation: 0,
899            flags: SubvolumeFlags::empty(),
900            uuid: Uuid::nil(),
901            parent_uuid: Uuid::nil(),
902            received_uuid: Uuid::nil(),
903            otransid: 0,
904            otime: UNIX_EPOCH,
905            name: String::new(),
906        }
907    }
908
909    #[test]
910    fn rle64_reads_little_endian() {
911        let buf = [0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08];
912        assert_eq!(rle64(&buf, 0), 0x0807060504030201);
913    }
914
915    #[test]
916    fn rle64_at_offset() {
917        let buf = [0xFF, 0xFF, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00];
918        assert_eq!(rle64(&buf, 2), 1);
919    }
920
921    #[test]
922    fn rle32_reads_little_endian() {
923        let buf = [0x78, 0x56, 0x34, 0x12];
924        assert_eq!(rle32(&buf, 0), 0x12345678);
925    }
926
927    #[test]
928    fn rle32_at_offset() {
929        let buf = [0x00, 0x00, 0x01, 0x00, 0x00, 0x00];
930        assert_eq!(rle32(&buf, 2), 1);
931    }
932
933    #[test]
934    fn timespec_zero_returns_epoch() {
935        assert_eq!(timespec_to_system_time(0, 0), UNIX_EPOCH);
936    }
937
938    #[test]
939    fn timespec_zero_sec_with_nonzero_nsec_returns_epoch() {
940        // sec==0 triggers the early return regardless of nsec
941        assert_eq!(timespec_to_system_time(0, 500_000_000), UNIX_EPOCH);
942    }
943
944    #[test]
945    fn timespec_nonzero_returns_correct_time() {
946        let t = timespec_to_system_time(1000, 500);
947        assert_eq!(t, UNIX_EPOCH + Duration::new(1000, 500));
948    }
949
950    #[test]
951    fn subvolume_flags_display_readonly() {
952        let flags = SubvolumeFlags::RDONLY;
953        assert_eq!(format!("{}", flags), "readonly");
954    }
955
956    #[test]
957    fn subvolume_flags_display_empty() {
958        let flags = SubvolumeFlags::empty();
959        assert_eq!(format!("{}", flags), "-");
960    }
961
962    #[test]
963    fn parse_root_ref_valid() {
964        // btrfs_root_ref: dirid (8 LE) + sequence (8 LE) + name_len (2 LE) + name bytes
965        let name = b"mysubvol";
966        let mut buf = Vec::new();
967        buf.extend_from_slice(&42u64.to_le_bytes()); // dirid
968        buf.extend_from_slice(&1u64.to_le_bytes()); // sequence
969        buf.extend_from_slice(&(name.len() as u16).to_le_bytes()); // name_len
970        buf.extend_from_slice(name);
971
972        let result = parse_root_ref(&buf);
973        assert!(result.is_some());
974        let (dir_id, parsed_name) = result.unwrap();
975        assert_eq!(dir_id, 42);
976        assert_eq!(parsed_name, "mysubvol");
977    }
978
979    #[test]
980    fn parse_root_ref_too_short_header() {
981        // Less than 18 bytes (sizeof btrfs_root_ref)
982        let buf = [0u8; 10];
983        assert!(parse_root_ref(&buf).is_none());
984    }
985
986    #[test]
987    fn parse_root_ref_too_short_name() {
988        // Header claims 10-byte name but buffer only has the header
989        let mut buf = vec![0u8; 18];
990        // Set name_len = 10 at offset 16
991        buf[16] = 10;
992        buf[17] = 0;
993        assert!(parse_root_ref(&buf).is_none());
994    }
995
996    #[test]
997    fn parse_root_ref_empty_name() {
998        let mut buf = Vec::new();
999        buf.extend_from_slice(&100u64.to_le_bytes()); // dirid
1000        buf.extend_from_slice(&0u64.to_le_bytes()); // sequence
1001        buf.extend_from_slice(&0u16.to_le_bytes()); // name_len = 0
1002
1003        let result = parse_root_ref(&buf);
1004        assert!(result.is_some());
1005        let (dir_id, parsed_name) = result.unwrap();
1006        assert_eq!(dir_id, 100);
1007        assert_eq!(parsed_name, "");
1008    }
1009
1010    #[test]
1011    fn build_full_path_single_subvol_parent_fs_tree() {
1012        // Subvolume 256, parent is FS_TREE (5)
1013        let items = vec![test_item(256, FS_TREE_OBJECTID)];
1014        let segments = vec!["mysub".to_string()];
1015        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
1016        let mut cache = HashMap::new();
1017
1018        let path = build_full_path(
1019            256,
1020            FS_TREE_OBJECTID,
1021            &id_to_idx,
1022            &segments,
1023            &items,
1024            &mut cache,
1025        );
1026        assert_eq!(path, "mysub");
1027    }
1028
1029    #[test]
1030    fn build_full_path_nested_chain() {
1031        // A (256) -> B (257) -> C (258), all parented under FS_TREE
1032        let items = vec![
1033            test_item(256, FS_TREE_OBJECTID),
1034            test_item(257, 256),
1035            test_item(258, 257),
1036        ];
1037        let segments = vec!["A".to_string(), "B".to_string(), "C".to_string()];
1038        let id_to_idx: HashMap<u64, usize> =
1039            [(256, 0), (257, 1), (258, 2)].into();
1040        let mut cache = HashMap::new();
1041
1042        let path = build_full_path(
1043            258,
1044            FS_TREE_OBJECTID,
1045            &id_to_idx,
1046            &segments,
1047            &items,
1048            &mut cache,
1049        );
1050        assert_eq!(path, "A/B/C");
1051    }
1052
1053    #[test]
1054    fn build_full_path_stops_at_top_id() {
1055        // A (256) -> B (257) -> C (258), top_id = 257 (B)
1056        // Paths are relative to top_id, so C's parent (257) == top_id means
1057        // C's path is just its own segment, not "A/B/C".
1058        let items = vec![
1059            test_item(256, FS_TREE_OBJECTID),
1060            test_item(257, 256),
1061            test_item(258, 257),
1062        ];
1063        let segments = vec!["A".to_string(), "B".to_string(), "C".to_string()];
1064        let id_to_idx: HashMap<u64, usize> =
1065            [(256, 0), (257, 1), (258, 2)].into();
1066        let mut cache = HashMap::new();
1067
1068        let path = build_full_path(
1069            258, 257, &id_to_idx, &segments, &items, &mut cache,
1070        );
1071        assert_eq!(path, "C");
1072
1073        // B's path is also just "B" (its parent 256/A is below top_id in the
1074        // tree, but B's own parent is not top_id — A's parent is FS_TREE).
1075        // With top_id=257, building B: parent=256, 256 is in id_to_idx but
1076        // 256's parent is FS_TREE (5) which triggers the stop, so chain = [257, 256],
1077        // and A gets its segment, B gets "A/B".
1078        let path_b = build_full_path(
1079            257, 257, &id_to_idx, &segments, &items, &mut cache,
1080        );
1081        // 257 itself: its parent is 256, 256 != top_id (257), so we walk up.
1082        // 256's parent is FS_TREE (5), which triggers stop. chain = [257, 256].
1083        // 256 resolves to "A" (parent is FS_TREE), 257 resolves to "A/B".
1084        assert_eq!(path_b, "A/B");
1085    }
1086
1087    #[test]
1088    fn build_full_path_cycle_detection() {
1089        // A (256) parent is B (257), B (257) parent is A (256) — mutual cycle
1090        let items = vec![test_item(256, 257), test_item(257, 256)];
1091        let segments = vec!["A".to_string(), "B".to_string()];
1092        let id_to_idx: HashMap<u64, usize> = [(256, 0), (257, 1)].into();
1093        let mut cache = HashMap::new();
1094
1095        // Must not hang. The result is truncated due to cycle detection.
1096        let _path = build_full_path(
1097            256,
1098            FS_TREE_OBJECTID,
1099            &id_to_idx,
1100            &segments,
1101            &items,
1102            &mut cache,
1103        );
1104        // Just verify it terminates and returns something (exact value depends
1105        // on cycle truncation heuristic).
1106    }
1107
1108    #[test]
1109    fn build_full_path_cached_ancestor() {
1110        // A (256) -> B (257) -> C (258)
1111        // Pre-cache B's path; building C should use it.
1112        let items = vec![
1113            test_item(256, FS_TREE_OBJECTID),
1114            test_item(257, 256),
1115            test_item(258, 257),
1116        ];
1117        let segments = vec!["A".to_string(), "B".to_string(), "C".to_string()];
1118        let id_to_idx: HashMap<u64, usize> =
1119            [(256, 0), (257, 1), (258, 2)].into();
1120        let mut cache = HashMap::new();
1121        cache.insert(257, "A/B".to_string());
1122
1123        let path = build_full_path(
1124            258,
1125            FS_TREE_OBJECTID,
1126            &id_to_idx,
1127            &segments,
1128            &items,
1129            &mut cache,
1130        );
1131        assert_eq!(path, "A/B/C");
1132    }
1133
1134    #[test]
1135    fn build_full_path_unknown_parent() {
1136        // Subvolume 256, parent 999 not in id_to_idx
1137        let items = vec![test_item(256, 999)];
1138        let segments = vec!["orphan".to_string()];
1139        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
1140        let mut cache = HashMap::new();
1141
1142        let path = build_full_path(
1143            256,
1144            FS_TREE_OBJECTID,
1145            &id_to_idx,
1146            &segments,
1147            &items,
1148            &mut cache,
1149        );
1150        assert_eq!(path, "orphan");
1151    }
1152
1153    #[test]
1154    fn build_full_path_parent_id_zero() {
1155        // Subvolume 256, parent_id == 0 (no backref found)
1156        let items = vec![test_item(256, 0)];
1157        let segments = vec!["noparent".to_string()];
1158        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
1159        let mut cache = HashMap::new();
1160
1161        let path = build_full_path(
1162            256,
1163            FS_TREE_OBJECTID,
1164            &id_to_idx,
1165            &segments,
1166            &items,
1167            &mut cache,
1168        );
1169        assert_eq!(path, "noparent");
1170    }
1171
1172    #[test]
1173    fn build_full_path_already_cached_target() {
1174        // If the target itself is already cached, return the cached value.
1175        let items = vec![test_item(256, FS_TREE_OBJECTID)];
1176        let segments = vec!["A".to_string()];
1177        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
1178        let mut cache = HashMap::new();
1179        cache.insert(256, "cached/path".to_string());
1180
1181        let path = build_full_path(
1182            256,
1183            FS_TREE_OBJECTID,
1184            &id_to_idx,
1185            &segments,
1186            &items,
1187            &mut cache,
1188        );
1189        assert_eq!(path, "cached/path");
1190    }
1191
1192    #[test]
1193    fn build_full_path_root_id_not_in_items() {
1194        // root_id not present in id_to_idx at all
1195        let items = vec![test_item(256, FS_TREE_OBJECTID)];
1196        let segments = vec!["A".to_string()];
1197        let id_to_idx: HashMap<u64, usize> = [(256, 0)].into();
1198        let mut cache = HashMap::new();
1199
1200        let path = build_full_path(
1201            999,
1202            FS_TREE_OBJECTID,
1203            &id_to_idx,
1204            &segments,
1205            &items,
1206            &mut cache,
1207        );
1208        assert_eq!(path, "");
1209    }
1210}