Skip to main content

btrfs_uapi/
device.rs

1//! # Device management: adding, removing, querying, and extent layout
2//!
3//! Covers adding and removing devices from a mounted filesystem, scanning a
4//! device to register it with the kernel, querying per-device I/O error
5//! statistics, checking whether all devices of a multi-device filesystem
6//! are present and ready, and computing minimum device sizes from the
7//! device extent tree.
8//!
9//! Most operations require `CAP_SYS_ADMIN`.
10
11use crate::{
12    filesystem::FilesystemInfo,
13    raw::{
14        BTRFS_DEV_EXTENT_KEY, BTRFS_DEV_STATS_RESET, BTRFS_DEV_TREE_OBJECTID,
15        BTRFS_DEVICE_SPEC_BY_ID,
16        btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS,
17        btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS,
18        btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS,
19        btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS,
20        btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX,
21        btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS, btrfs_ioc_add_dev,
22        btrfs_ioc_dev_info, btrfs_ioc_devices_ready, btrfs_ioc_forget_dev,
23        btrfs_ioc_get_dev_stats, btrfs_ioc_rm_dev, btrfs_ioc_rm_dev_v2,
24        btrfs_ioc_scan_dev, btrfs_ioctl_dev_info_args,
25        btrfs_ioctl_get_dev_stats, btrfs_ioctl_vol_args,
26        btrfs_ioctl_vol_args_v2,
27    },
28    tree_search::{SearchFilter, tree_search},
29};
30use nix::{errno::Errno, libc::c_char};
31use std::{
32    ffi::CStr,
33    fs::OpenOptions,
34    mem,
35    os::{fd::AsRawFd, unix::io::BorrowedFd},
36};
37use uuid::Uuid;
38
39/// Information about a single device within a btrfs filesystem, as returned
40/// by `BTRFS_IOC_DEV_INFO`.
41#[derive(Debug, Clone)]
42pub struct DeviceInfo {
43    /// Device ID.
44    pub devid: u64,
45    /// Device UUID.
46    pub uuid: Uuid,
47    /// Number of bytes used on this device.
48    pub bytes_used: u64,
49    /// Total size of this device in bytes.
50    pub total_bytes: u64,
51    /// Path to the block device, e.g. `/dev/sda`.
52    pub path: String,
53}
54
/// Selects a device for operations that accept either a path or a device ID.
#[derive(Clone, Debug)]
pub enum DeviceSpec<'a> {
    /// Path of a block device (e.g. `/dev/sdb`). The remove ioctl also
    /// accepts the special strings `"missing"` and `"cancel"` here.
    Path(&'a CStr),
    /// Btrfs device ID, as reported by `BTRFS_IOC_DEV_INFO`.
    Id(u64),
}
64
/// Per-device I/O error statistics, as returned by `BTRFS_IOC_GET_DEV_STATS`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DeviceStats {
    /// Device ID these stats belong to.
    pub devid: u64,
    /// Number of write I/O errors (EIO/EREMOTEIO from lower layers).
    pub write_errs: u64,
    /// Number of read I/O errors (EIO/EREMOTEIO from lower layers).
    pub read_errs: u64,
    /// Number of flush I/O errors (EIO/EREMOTEIO from lower layers).
    pub flush_errs: u64,
    /// Number of checksum or bytenr corruption errors detected on read.
    pub corruption_errs: u64,
    /// Number of generation errors (blocks not written where expected).
    pub generation_errs: u64,
}

impl DeviceStats {
    /// All error counters in declaration order (`devid` is not a counter).
    fn counters(&self) -> [u64; 5] {
        [
            self.write_errs,
            self.read_errs,
            self.flush_errs,
            self.corruption_errs,
            self.generation_errs,
        ]
    }

    /// Sum of all error counters.
    ///
    /// The sum saturates at `u64::MAX` instead of overflowing, so very large
    /// counters can neither panic (debug builds) nor wrap around to a small
    /// value (release builds).
    #[must_use]
    pub fn total_errs(&self) -> u64 {
        self.counters()
            .iter()
            .fold(0u64, |sum, &count| sum.saturating_add(count))
    }

    /// Returns `true` if every counter is zero.
    ///
    /// Each counter is checked individually rather than via the sum, so the
    /// answer does not depend on how the total is accumulated.
    #[must_use]
    pub fn is_clean(&self) -> bool {
        self.counters().iter().all(|&count| count == 0)
    }
}
99
#[cfg(test)]
mod tests {
    use super::*;

    /// A default-constructed value has every counter at zero.
    #[test]
    fn dev_stats_default_is_clean() {
        let zeroed = DeviceStats::default();
        assert_eq!(zeroed.total_errs(), 0);
        assert!(zeroed.is_clean());
    }

    /// The total is the sum of the five error counters; `devid` is excluded.
    #[test]
    fn dev_stats_total_errs() {
        let mixed = DeviceStats {
            devid: 1,
            write_errs: 1,
            read_errs: 2,
            flush_errs: 3,
            corruption_errs: 4,
            generation_errs: 5,
        };
        assert!(!mixed.is_clean());
        assert_eq!(mixed.total_errs(), 15);
    }

    /// One non-zero counter is enough to make the stats not clean.
    #[test]
    fn dev_stats_single_error_not_clean() {
        let mut one_error = DeviceStats::default();
        one_error.corruption_errs = 1;
        assert_eq!(one_error.total_errs(), 1);
        assert!(!one_error.is_clean());
    }
}
135
136/// Copy the bytes of `path` (without the nul terminator) into `name`,
137/// returning `ENAMETOOLONG` if the path (including the terminator that the
138/// kernel expects to already be present via zeroing) does not fit.
139#[allow(clippy::cast_possible_wrap)] // ASCII bytes always fit in c_char
140fn copy_path_to_name(name: &mut [c_char], path: &CStr) -> nix::Result<()> {
141    let bytes = path.to_bytes(); // excludes nul terminator
142    if bytes.len() >= name.len() {
143        return Err(Errno::ENAMETOOLONG);
144    }
145    for (i, &b) in bytes.iter().enumerate() {
146        name[i] = b as c_char;
147    }
148    // The remainder of `name` is already zeroed by the caller (mem::zeroed).
149    Ok(())
150}
151
152/// Open `/dev/btrfs-control` for read+write, mapping any `std::io::Error` to
153/// the appropriate `nix::errno::Errno`.
154fn open_control() -> nix::Result<std::fs::File> {
155    OpenOptions::new()
156        .read(true)
157        .write(true)
158        .open("/dev/btrfs-control")
159        .map_err(|e| {
160            Errno::from_raw(e.raw_os_error().unwrap_or(nix::libc::ENODEV))
161        })
162}
163
164/// Query information about the device with the given `devid` on the filesystem
165/// referred to by `fd`.
166///
167/// Returns `None` if no device with that ID exists (`ENODEV`).
168///
169/// # Errors
170///
171/// Returns `Err` if the ioctl fails (other than `ENODEV`).
172pub fn device_info(
173    fd: BorrowedFd,
174    devid: u64,
175) -> nix::Result<Option<DeviceInfo>> {
176    let mut raw: btrfs_ioctl_dev_info_args = unsafe { mem::zeroed() };
177    raw.devid = devid;
178
179    match unsafe { btrfs_ioc_dev_info(fd.as_raw_fd(), &raw mut raw) } {
180        Err(Errno::ENODEV) => return Ok(None),
181        Err(e) => return Err(e),
182        Ok(_) => {}
183    }
184
185    let path = unsafe { CStr::from_ptr(raw.path.as_ptr().cast()) }
186        .to_string_lossy()
187        .into_owned();
188
189    Ok(Some(DeviceInfo {
190        devid: raw.devid,
191        uuid: Uuid::from_bytes(raw.uuid),
192        bytes_used: raw.bytes_used,
193        total_bytes: raw.total_bytes,
194        path,
195    }))
196}
197
198/// Query information about all devices in the filesystem referred to by `fd`,
199/// using the device count from a previously obtained [`FilesystemInfo`].
200///
201/// Iterates devids `1..=max_id`, skipping any that return `ENODEV` (holes in
202/// the devid space are normal when devices have been removed).
203///
204/// # Errors
205///
206/// Returns `Err` if any device info ioctl fails.
207pub fn device_info_all(
208    fd: BorrowedFd,
209    fs_info: &FilesystemInfo,
210) -> nix::Result<Vec<DeviceInfo>> {
211    #[allow(clippy::cast_possible_truncation)]
212    // device count always fits in usize
213    let mut devices = Vec::with_capacity(fs_info.num_devices as usize);
214    for devid in 1..=fs_info.max_id {
215        if let Some(info) = device_info(fd, devid)? {
216            devices.push(info);
217        }
218    }
219    Ok(devices)
220}
221
222/// Add a device to the btrfs filesystem referred to by `fd`.
223///
224/// `path` must be the path to an unmounted block device. The kernel requires
225/// `CAP_SYS_ADMIN`.
226///
227/// # Errors
228///
229/// Returns `Err` if the ioctl fails.
230pub fn device_add(fd: BorrowedFd, path: &CStr) -> nix::Result<()> {
231    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
232    copy_path_to_name(&mut raw.name, path)?;
233    unsafe { btrfs_ioc_add_dev(fd.as_raw_fd(), &raw const raw) }?;
234    Ok(())
235}
236
237/// Remove a device from the btrfs filesystem referred to by `fd`.
238///
239/// The device can be specified either by path or by its btrfs device ID via
240/// [`DeviceSpec`]. Uses `BTRFS_IOC_RM_DEV_V2` and falls back to the older
241/// `BTRFS_IOC_RM_DEV` ioctl on kernels that do not support the v2 variant
242/// (only possible when removing by path). The kernel requires `CAP_SYS_ADMIN`.
243///
244/// Errors: ENOTTY or EOPNOTSUPP from `RM_DEV_V2` triggers an automatic
245/// fallback to the v1 ioctl (path-based removal only; by-ID removal
246/// requires v2 and will propagate the error).  `EBUSY` if the device holds
247/// the only copy of some data and cannot be removed.
248///
249/// # Errors
250///
251/// Returns `Err` if the remove ioctl fails.
252pub fn device_remove(fd: BorrowedFd, spec: &DeviceSpec<'_>) -> nix::Result<()> {
253    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
254
255    match *spec {
256        DeviceSpec::Id(devid) => {
257            args.flags = u64::from(BTRFS_DEVICE_SPEC_BY_ID);
258            // SAFETY: devid is the active union member when BTRFS_DEVICE_SPEC_BY_ID is set.
259            args.__bindgen_anon_2.devid = devid;
260            unsafe { btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &raw const args) }?;
261        }
262        DeviceSpec::Path(path) => {
263            // SAFETY: name is the active union member when flags == 0.
264            unsafe {
265                copy_path_to_name(&mut args.__bindgen_anon_2.name, path)
266            }?;
267            match unsafe {
268                btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &raw const args)
269            } {
270                Ok(_) => {}
271                // Fall back to the old single-arg ioctl on kernels that either
272                // don't know about v2 (ENOTTY) or don't recognise our flags (EOPNOTSUPP).
273                Err(Errno::ENOTTY | Errno::EOPNOTSUPP) => {
274                    let mut old: btrfs_ioctl_vol_args =
275                        unsafe { mem::zeroed() };
276                    copy_path_to_name(&mut old.name, path)?;
277                    unsafe {
278                        btrfs_ioc_rm_dev(fd.as_raw_fd(), &raw const old)
279                    }?;
280                }
281                Err(e) => return Err(e),
282            }
283        }
284    }
285
286    Ok(())
287}
288
289/// Register a block device with the kernel's btrfs device scanner so that
290/// multi-device filesystems containing it can be mounted.
291///
292/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_SCAN_DEV`. `path` must
293/// be the path to a block device that contains a btrfs filesystem member.
294///
295/// # Errors
296///
297/// Returns `Err` if opening `/dev/btrfs-control` or the ioctl fails.
298pub fn device_scan(path: &CStr) -> nix::Result<()> {
299    let ctl = open_control()?;
300    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
301    copy_path_to_name(&mut raw.name, path)?;
302    unsafe { btrfs_ioc_scan_dev(ctl.as_raw_fd(), &raw const raw) }?;
303    Ok(())
304}
305
306/// Unregister a device (or all stale devices) from the kernel's btrfs device
307/// scanner.
308///
309/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_FORGET_DEV`. If `path`
310/// is `None`, all devices that are not part of a currently mounted filesystem
311/// are unregistered. If `path` is `Some`, only that specific device path is
312/// unregistered.
313///
314/// # Errors
315///
316/// Returns `Err` if opening `/dev/btrfs-control` or the ioctl fails.
317pub fn device_forget(path: Option<&CStr>) -> nix::Result<()> {
318    let ctl = open_control()?;
319    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
320    if let Some(p) = path {
321        copy_path_to_name(&mut raw.name, p)?;
322    }
323    unsafe { btrfs_ioc_forget_dev(ctl.as_raw_fd(), &raw const raw) }?;
324    Ok(())
325}
326
327/// Check whether all member devices of the filesystem that contains `path`
328/// are available and the filesystem is ready to mount.
329///
330/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_DEVICES_READY`. `path`
331/// must be the path to one of the block devices belonging to the filesystem.
332/// Returns `Ok(())` when all devices are present; returns an error (typically
333/// `ENOENT` or `ENXIO`) if the set is incomplete.
334///
335/// # Errors
336///
337/// Returns `Err` if some devices are missing or the ioctl fails.
338pub fn device_ready(path: &CStr) -> nix::Result<()> {
339    let ctl = open_control()?;
340    // BTRFS_IOC_DEVICES_READY is declared _IOR but the kernel reads the device
341    // path from args.name, so we pass a mut pointer as ioctl_read! requires.
342    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
343    copy_path_to_name(&mut raw.name, path)?;
344    unsafe { btrfs_ioc_devices_ready(ctl.as_raw_fd(), &raw mut raw) }?;
345    Ok(())
346}
347
348/// Query I/O error statistics for the device identified by `devid` within the
349/// filesystem referred to by `fd`.
350///
351/// If `reset` is `true`, the kernel atomically returns the current values and
352/// then resets all counters to zero. The kernel requires `CAP_SYS_ADMIN`.
353///
354/// # Errors
355///
356/// Returns `Err` if the ioctl fails.
357pub fn device_stats(
358    fd: BorrowedFd,
359    devid: u64,
360    reset: bool,
361) -> nix::Result<DeviceStats> {
362    let mut raw: btrfs_ioctl_get_dev_stats = unsafe { mem::zeroed() };
363    raw.devid = devid;
364    raw.nr_items = u64::from(btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX);
365    if reset {
366        raw.flags = u64::from(BTRFS_DEV_STATS_RESET);
367    }
368
369    unsafe { btrfs_ioc_get_dev_stats(fd.as_raw_fd(), &raw mut raw) }?;
370
371    Ok(DeviceStats {
372        devid,
373        write_errs: raw.values
374            [btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS as usize],
375        read_errs: raw.values
376            [btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS as usize],
377        flush_errs: raw.values
378            [btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS as usize],
379        corruption_errs: raw.values
380            [btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS as usize],
381        generation_errs: raw.values
382            [btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS as usize],
383    })
384}
385
/// One mebibyte, in bytes.
const SZ_1M: u64 = 1 << 20;
/// Thirty-two mebibytes, in bytes.
const SZ_32M: u64 = 32 << 20;

/// Number of superblock mirror copies btrfs maintains.
const BTRFS_SUPER_MIRROR_MAX: usize = 3;
391
/// Return the byte offset of superblock mirror `i`.
///
/// Mirror 0 is at 64 KiB, mirror 1 at 64 MiB, mirror 2 at 256 GiB. Mirrors
/// other than the primary sit at `16 KiB << (12 * i)`.
///
/// Only the mirror indices btrfs defines (0, 1 and 2) are meaningful. An
/// index large enough to push the shift past 63 bits overflows.
fn sb_offset(i: usize) -> u64 {
    /// Offset of the primary superblock (64 KiB).
    const PRIMARY_OFFSET: u64 = 64 * 1024;
    /// log2 of the 16 KiB base used for the mirror copies.
    const MIRROR_BASE_SHIFT: usize = 14;
    /// Each successive mirror is shifted left by this many more bits.
    const MIRROR_SHIFT: usize = 12;

    match i {
        0 => PRIMARY_OFFSET,
        _ => 1u64 << (MIRROR_BASE_SHIFT + MIRROR_SHIFT * i),
    }
}
401
/// A contiguous run of physical bytes on a device, described by its first
/// and last byte.
#[derive(Clone, Copy, Debug)]
struct Extent {
    /// First byte of the range.
    start: u64,
    /// Last byte of the range (inclusive).
    end: u64,
}
409
410/// Compute the minimum size to which device `devid` can be shrunk.
411///
412/// Walks the device tree for all `DEV_EXTENT_KEY` items belonging to
413/// `devid`, sums their lengths, then adjusts for extents that sit beyond
414/// the sum by checking whether they can be relocated into holes closer to
415/// the start of the device. The algorithm matches `btrfs inspect-internal
416/// min-dev-size` from btrfs-progs.
417///
418/// Requires `CAP_SYS_ADMIN`.
419///
420/// # Errors
421///
422/// Returns `Err` if the tree search ioctl fails.
423pub fn device_min_size(fd: BorrowedFd, devid: u64) -> nix::Result<u64> {
424    let mut dev_extents: Vec<(u64, u64)> = Vec::new();
425
426    tree_search(
427        fd,
428        SearchFilter::for_objectid_range(
429            u64::from(BTRFS_DEV_TREE_OBJECTID),
430            BTRFS_DEV_EXTENT_KEY,
431            devid,
432            devid,
433        ),
434        |hdr, data| {
435            let Some(de) = btrfs_disk::items::DeviceExtent::parse(data) else {
436                return Ok(());
437            };
438            dev_extents.push((hdr.offset, de.length));
439            Ok(())
440        },
441    )?;
442
443    Ok(compute_min_size(&dev_extents))
444}
445
446/// Compute the minimum device size from a list of device extents.
447///
448/// Each entry is `(physical_start, length)`. The list must be sorted by
449/// ascending `physical_start` (as returned by the device tree).
450///
451/// The algorithm sums all extent lengths (plus 1 MiB base), then tries to
452/// relocate tail extents into holes to reduce the total. Matches the
453/// btrfs-progs `min-dev-size` logic.
454#[must_use]
455pub fn compute_min_size(dev_extents: &[(u64, u64)]) -> u64 {
456    let mut min_size: u64 = SZ_1M;
457    let mut extents: Vec<Extent> = Vec::new();
458    let mut holes: Vec<Extent> = Vec::new();
459    let mut last_pos: Option<u64> = None;
460
461    for &(phys_start, len) in dev_extents {
462        min_size += len;
463
464        extents.push(Extent {
465            start: phys_start,
466            end: phys_start + len - 1,
467        });
468
469        if let Some(prev_end) = last_pos
470            && prev_end != phys_start
471        {
472            holes.push(Extent {
473                start: prev_end,
474                end: phys_start - 1,
475            });
476        }
477
478        last_pos = Some(phys_start + len);
479    }
480
481    // Sort extents by descending end offset for the adjustment pass.
482    extents.sort_by(|a, b| b.end.cmp(&a.end));
483
484    adjust_min_size(&mut extents, &mut holes, &mut min_size);
485
486    min_size
487}
488
489/// Check whether a byte range `[start, end]` contains a superblock mirror.
490fn hole_includes_sb_mirror(start: u64, end: u64) -> bool {
491    (0..BTRFS_SUPER_MIRROR_MAX).any(|i| {
492        let bytenr = sb_offset(i);
493        bytenr >= start && bytenr <= end
494    })
495}
496
497/// Adjust `min_size` downward by relocating tail extents into holes.
498///
499/// Processes extents in descending order of end offset. If an extent sits
500/// beyond the current `min_size`, try to find a hole large enough to
501/// relocate it. If no hole fits, the device cannot be shrunk past that
502/// extent and `min_size` is set to its end + 1.
503///
504/// Adds scratch space (largest relocated extent + 32 MiB for a potential
505/// system chunk allocation) when any relocation is needed.
506fn adjust_min_size(
507    extents: &mut Vec<Extent>,
508    holes: &mut Vec<Extent>,
509    min_size: &mut u64,
510) {
511    let mut scratch_space: u64 = 0;
512
513    while let Some(&ext) = extents.first() {
514        if ext.end < *min_size {
515            break;
516        }
517
518        let extent_len = ext.end - ext.start + 1;
519
520        // Find the first hole large enough to hold this extent.
521        let hole_idx = holes.iter().position(|h| {
522            let hole_len = h.end - h.start + 1;
523            hole_len >= extent_len
524        });
525
526        let Some(idx) = hole_idx else {
527            *min_size = ext.end + 1;
528            break;
529        };
530
531        // If the target hole contains a superblock mirror location,
532        // pessimistically assume we need one more extent worth of space.
533        if hole_includes_sb_mirror(
534            holes[idx].start,
535            holes[idx].start + extent_len - 1,
536        ) {
537            *min_size += extent_len;
538        }
539
540        // Shrink or remove the hole.
541        let hole_len = holes[idx].end - holes[idx].start + 1;
542        if hole_len > extent_len {
543            holes[idx].start += extent_len;
544        } else {
545            holes.remove(idx);
546        }
547
548        extents.remove(0);
549
550        if extent_len > scratch_space {
551            scratch_space = extent_len;
552        }
553    }
554
555    if scratch_space > 0 {
556        *min_size += scratch_space;
557        // Chunk allocation may require a new system chunk (up to 32 MiB).
558        *min_size += SZ_32M;
559    }
560}