Skip to main content

btrfs_uapi/
device.rs

1//! # Device management: adding, removing, querying, and extent layout
2//!
3//! Covers adding and removing devices from a mounted filesystem, scanning a
4//! device to register it with the kernel, querying per-device I/O error
5//! statistics, checking whether all devices of a multi-device filesystem
6//! are present and ready, and computing minimum device sizes from the
7//! device extent tree.
8//!
9//! Most operations require `CAP_SYS_ADMIN`.
10
11use crate::{
12    field_size,
13    filesystem::FilesystemInfo,
14    raw::{
15        BTRFS_DEV_EXTENT_KEY, BTRFS_DEV_STATS_RESET, BTRFS_DEV_TREE_OBJECTID,
16        BTRFS_DEVICE_SPEC_BY_ID, btrfs_dev_extent,
17        btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS,
18        btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS,
19        btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS,
20        btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS,
21        btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX,
22        btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS, btrfs_ioc_add_dev,
23        btrfs_ioc_dev_info, btrfs_ioc_devices_ready, btrfs_ioc_forget_dev,
24        btrfs_ioc_get_dev_stats, btrfs_ioc_rm_dev, btrfs_ioc_rm_dev_v2,
25        btrfs_ioc_scan_dev, btrfs_ioctl_dev_info_args,
26        btrfs_ioctl_get_dev_stats, btrfs_ioctl_vol_args,
27        btrfs_ioctl_vol_args_v2,
28    },
29    tree_search::{SearchKey, tree_search},
30    util::read_le_u64,
31};
32use nix::{errno::Errno, libc::c_char};
33use std::{
34    ffi::CStr,
35    fs::OpenOptions,
36    mem,
37    os::{fd::AsRawFd, unix::io::BorrowedFd},
38};
39use uuid::Uuid;
40
/// Information about a single device within a btrfs filesystem, as returned
/// by `BTRFS_IOC_DEV_INFO`.
///
/// Instances are built by [`device_info`] from the kernel's
/// `btrfs_ioctl_dev_info_args` reply.
#[derive(Debug, Clone)]
pub struct DeviceInfo {
    /// Device ID.
    pub devid: u64,
    /// Device UUID.
    pub uuid: Uuid,
    /// Number of bytes used on this device.
    pub bytes_used: u64,
    /// Total size of this device in bytes.
    pub total_bytes: u64,
    /// Path to the block device, e.g. `/dev/sda`.
    ///
    /// Decoded lossily from the kernel's byte string: any bytes that are not
    /// valid UTF-8 are replaced with U+FFFD.
    pub path: String,
}
56
/// Specifies a device for operations that can address by either path or ID.
///
/// Consumed by [`device_remove`].
#[derive(Debug, Clone)]
pub enum DeviceSpec<'a> {
    /// A block device path (e.g. `/dev/sdb`), or the special strings
    /// `"missing"` or `"cancel"` accepted by the remove ioctl.
    ///
    /// The bytes are copied into the ioctl's fixed-size name buffer; a path
    /// that does not fit makes the operation fail with `ENAMETOOLONG`.
    Path(&'a CStr),
    /// A btrfs device ID as reported by `BTRFS_IOC_DEV_INFO`.
    ///
    /// Passed to the kernel with the `BTRFS_DEVICE_SPEC_BY_ID` flag set.
    Id(u64),
}
66
/// Per-device I/O error statistics, as returned by `BTRFS_IOC_GET_DEV_STATS`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DeviceStats {
    /// Device ID these stats belong to.
    pub devid: u64,
    /// Number of write I/O errors (EIO/EREMOTEIO from lower layers).
    pub write_errs: u64,
    /// Number of read I/O errors (EIO/EREMOTEIO from lower layers).
    pub read_errs: u64,
    /// Number of flush I/O errors (EIO/EREMOTEIO from lower layers).
    pub flush_errs: u64,
    /// Number of checksum or bytenr corruption errors detected on read.
    pub corruption_errs: u64,
    /// Number of generation errors (blocks not written where expected).
    pub generation_errs: u64,
}

impl DeviceStats {
    /// Sum of all error counters, saturating at `u64::MAX`.
    ///
    /// `devid` is not part of the sum. Saturating addition is used so that
    /// extreme counter values can neither panic (debug builds) nor wrap
    /// around to a small number (release builds).
    pub fn total_errs(&self) -> u64 {
        self.write_errs
            .saturating_add(self.read_errs)
            .saturating_add(self.flush_errs)
            .saturating_add(self.corruption_errs)
            .saturating_add(self.generation_errs)
    }

    /// Returns `true` if every counter is zero.
    ///
    /// Because [`total_errs`](Self::total_errs) saturates instead of
    /// wrapping, the total is zero only when each individual counter is zero.
    pub fn is_clean(&self) -> bool {
        self.total_errs() == 0
    }
}
99
#[cfg(test)]
mod tests {
    use super::*;

    /// A default-constructed record has all counters at zero.
    #[test]
    fn dev_stats_default_is_clean() {
        let zeroed = DeviceStats::default();
        assert_eq!(zeroed.total_errs(), 0);
        assert!(zeroed.is_clean());
    }

    /// Every counter contributes to the total; `devid` does not.
    #[test]
    fn dev_stats_total_errs() {
        let populated = DeviceStats {
            devid: 1,
            write_errs: 1,
            read_errs: 2,
            flush_errs: 3,
            corruption_errs: 4,
            generation_errs: 5,
        };
        assert!(!populated.is_clean());
        assert_eq!(populated.total_errs(), 15);
    }

    /// A single non-zero counter is enough to make the device unclean.
    #[test]
    fn dev_stats_single_error_not_clean() {
        let mut one_error = DeviceStats::default();
        one_error.corruption_errs = 1;
        assert_eq!(one_error.total_errs(), 1);
        assert!(!one_error.is_clean());
    }
}
135
136/// Copy the bytes of `path` (without the nul terminator) into `name`,
137/// returning `ENAMETOOLONG` if the path (including the terminator that the
138/// kernel expects to already be present via zeroing) does not fit.
139fn copy_path_to_name(name: &mut [c_char], path: &CStr) -> nix::Result<()> {
140    let bytes = path.to_bytes(); // excludes nul terminator
141    if bytes.len() >= name.len() {
142        return Err(Errno::ENAMETOOLONG);
143    }
144    for (i, &b) in bytes.iter().enumerate() {
145        name[i] = b as c_char;
146    }
147    // The remainder of `name` is already zeroed by the caller (mem::zeroed).
148    Ok(())
149}
150
151/// Open `/dev/btrfs-control` for read+write, mapping any `std::io::Error` to
152/// the appropriate `nix::errno::Errno`.
153fn open_control() -> nix::Result<std::fs::File> {
154    OpenOptions::new()
155        .read(true)
156        .write(true)
157        .open("/dev/btrfs-control")
158        .map_err(|e| {
159            Errno::from_raw(e.raw_os_error().unwrap_or(nix::libc::ENODEV))
160        })
161}
162
163/// Query information about the device with the given `devid` on the filesystem
164/// referred to by `fd`.
165///
166/// Returns `None` if no device with that ID exists (`ENODEV`).
167pub fn device_info(
168    fd: BorrowedFd,
169    devid: u64,
170) -> nix::Result<Option<DeviceInfo>> {
171    let mut raw: btrfs_ioctl_dev_info_args = unsafe { mem::zeroed() };
172    raw.devid = devid;
173
174    match unsafe { btrfs_ioc_dev_info(fd.as_raw_fd(), &mut raw) } {
175        Err(Errno::ENODEV) => return Ok(None),
176        Err(e) => return Err(e),
177        Ok(_) => {}
178    }
179
180    let path = unsafe { CStr::from_ptr(raw.path.as_ptr() as *const _) }
181        .to_string_lossy()
182        .into_owned();
183
184    Ok(Some(DeviceInfo {
185        devid: raw.devid,
186        uuid: Uuid::from_bytes(raw.uuid),
187        bytes_used: raw.bytes_used,
188        total_bytes: raw.total_bytes,
189        path,
190    }))
191}
192
193/// Query information about all devices in the filesystem referred to by `fd`,
194/// using the device count from a previously obtained [`FilesystemInfo`].
195///
196/// Iterates devids `1..=max_id`, skipping any that return `ENODEV` (holes in
197/// the devid space are normal when devices have been removed).
198pub fn device_info_all(
199    fd: BorrowedFd,
200    fs_info: &FilesystemInfo,
201) -> nix::Result<Vec<DeviceInfo>> {
202    let mut devices = Vec::with_capacity(fs_info.num_devices as usize);
203    for devid in 1..=fs_info.max_id {
204        if let Some(info) = device_info(fd, devid)? {
205            devices.push(info);
206        }
207    }
208    Ok(devices)
209}
210
211/// Add a device to the btrfs filesystem referred to by `fd`.
212///
213/// `path` must be the path to an unmounted block device. The kernel requires
214/// `CAP_SYS_ADMIN`.
215pub fn device_add(fd: BorrowedFd, path: &CStr) -> nix::Result<()> {
216    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
217    copy_path_to_name(&mut raw.name, path)?;
218    unsafe { btrfs_ioc_add_dev(fd.as_raw_fd(), &raw) }?;
219    Ok(())
220}
221
222/// Remove a device from the btrfs filesystem referred to by `fd`.
223///
224/// The device can be specified either by path or by its btrfs device ID via
225/// [`DeviceSpec`]. Uses `BTRFS_IOC_RM_DEV_V2` and falls back to the older
226/// `BTRFS_IOC_RM_DEV` ioctl on kernels that do not support the v2 variant
227/// (only possible when removing by path). The kernel requires `CAP_SYS_ADMIN`.
228///
229/// Errors: ENOTTY or EOPNOTSUPP from `RM_DEV_V2` triggers an automatic
230/// fallback to the v1 ioctl (path-based removal only; by-ID removal
231/// requires v2 and will propagate the error).  EBUSY if the device holds
232/// the only copy of some data and cannot be removed.
233pub fn device_remove(fd: BorrowedFd, spec: DeviceSpec) -> nix::Result<()> {
234    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
235
236    match spec {
237        DeviceSpec::Id(devid) => {
238            args.flags = BTRFS_DEVICE_SPEC_BY_ID as u64;
239            // SAFETY: devid is the active union member when BTRFS_DEVICE_SPEC_BY_ID is set.
240            args.__bindgen_anon_2.devid = devid;
241            unsafe { btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &args) }?;
242        }
243        DeviceSpec::Path(path) => {
244            // SAFETY: name is the active union member when flags == 0.
245            unsafe {
246                copy_path_to_name(&mut args.__bindgen_anon_2.name, path)
247            }?;
248            match unsafe { btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &args) } {
249                Ok(_) => {}
250                // Fall back to the old single-arg ioctl on kernels that either
251                // don't know about v2 (ENOTTY) or don't recognise our flags (EOPNOTSUPP).
252                Err(Errno::ENOTTY) | Err(Errno::EOPNOTSUPP) => {
253                    let mut old: btrfs_ioctl_vol_args =
254                        unsafe { mem::zeroed() };
255                    copy_path_to_name(&mut old.name, path)?;
256                    unsafe { btrfs_ioc_rm_dev(fd.as_raw_fd(), &old) }?;
257                }
258                Err(e) => return Err(e),
259            }
260        }
261    }
262
263    Ok(())
264}
265
266/// Register a block device with the kernel's btrfs device scanner so that
267/// multi-device filesystems containing it can be mounted.
268///
269/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_SCAN_DEV`. `path` must
270/// be the path to a block device that contains a btrfs filesystem member.
271pub fn device_scan(path: &CStr) -> nix::Result<()> {
272    let ctl = open_control()?;
273    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
274    copy_path_to_name(&mut raw.name, path)?;
275    unsafe { btrfs_ioc_scan_dev(ctl.as_raw_fd(), &raw) }?;
276    Ok(())
277}
278
279/// Unregister a device (or all stale devices) from the kernel's btrfs device
280/// scanner.
281///
282/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_FORGET_DEV`. If `path`
283/// is `None`, all devices that are not part of a currently mounted filesystem
284/// are unregistered. If `path` is `Some`, only that specific device path is
285/// unregistered.
286pub fn device_forget(path: Option<&CStr>) -> nix::Result<()> {
287    let ctl = open_control()?;
288    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
289    if let Some(p) = path {
290        copy_path_to_name(&mut raw.name, p)?;
291    }
292    unsafe { btrfs_ioc_forget_dev(ctl.as_raw_fd(), &raw) }?;
293    Ok(())
294}
295
296/// Check whether all member devices of the filesystem that contains `path`
297/// are available and the filesystem is ready to mount.
298///
299/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_DEVICES_READY`. `path`
300/// must be the path to one of the block devices belonging to the filesystem.
301/// Returns `Ok(())` when all devices are present; returns an error (typically
302/// `ENOENT` or `ENXIO`) if the set is incomplete.
303pub fn device_ready(path: &CStr) -> nix::Result<()> {
304    let ctl = open_control()?;
305    // BTRFS_IOC_DEVICES_READY is declared _IOR but the kernel reads the device
306    // path from args.name, so we pass a mut pointer as ioctl_read! requires.
307    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
308    copy_path_to_name(&mut raw.name, path)?;
309    unsafe { btrfs_ioc_devices_ready(ctl.as_raw_fd(), &mut raw) }?;
310    Ok(())
311}
312
313/// Query I/O error statistics for the device identified by `devid` within the
314/// filesystem referred to by `fd`.
315///
316/// If `reset` is `true`, the kernel atomically returns the current values and
317/// then resets all counters to zero. The kernel requires `CAP_SYS_ADMIN`.
318pub fn device_stats(
319    fd: BorrowedFd,
320    devid: u64,
321    reset: bool,
322) -> nix::Result<DeviceStats> {
323    let mut raw: btrfs_ioctl_get_dev_stats = unsafe { mem::zeroed() };
324    raw.devid = devid;
325    raw.nr_items = btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX as u64;
326    if reset {
327        raw.flags = BTRFS_DEV_STATS_RESET as u64;
328    }
329
330    unsafe { btrfs_ioc_get_dev_stats(fd.as_raw_fd(), &mut raw) }?;
331
332    Ok(DeviceStats {
333        devid,
334        write_errs: raw.values
335            [btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS as usize],
336        read_errs: raw.values
337            [btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS as usize],
338        flush_errs: raw.values
339            [btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS as usize],
340        corruption_errs: raw.values
341            [btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS as usize],
342        generation_errs: raw.values
343            [btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS as usize],
344    })
345}
346
/// Byte offset of the `length` field inside `btrfs_dev_extent`; used to read
/// that field out of the raw item data handed over by the tree search.
const DEV_EXTENT_LENGTH_OFF: usize =
    std::mem::offset_of!(btrfs_dev_extent, length);

/// One mebibyte, in bytes.
const SZ_1M: u64 = 1024 * 1024;
/// Thirty-two mebibytes, in bytes.
const SZ_32M: u64 = 32 * 1024 * 1024;

/// Number of superblock mirror copies btrfs maintains.
const BTRFS_SUPER_MIRROR_MAX: usize = 3;
355
/// Return the byte offset of superblock mirror `i`.
///
/// Mirror 0 is at 64 KiB, mirror 1 at 64 MiB, mirror 2 at 256 GiB.
///
/// Mirror 0 sits at a fixed offset; every further mirror is located at
/// `16 KiB << (12 * i)`. Only indices below the number of superblock mirrors
/// (three) are meaningful.
fn sb_offset(i: usize) -> u64 {
    /// Fixed location of the primary superblock.
    const PRIMARY_OFFSET: u64 = 64 * 1024;
    /// Base value the offsets of the secondary mirrors are derived from.
    const MIRROR_BASE: u64 = 16 * 1024;
    /// Number of bits the base is shifted left per mirror index.
    const MIRROR_SHIFT: u64 = 12;

    match i {
        0 => PRIMARY_OFFSET,
        _ => MIRROR_BASE << (MIRROR_SHIFT * i as u64),
    }
}
365
/// A contiguous physical byte range on a device (inclusive end).
///
/// Used both for allocated device extents and for the holes between them;
/// the length of a range is `end - start + 1`.
#[derive(Debug, Clone, Copy)]
struct Extent {
    /// First byte of the range.
    start: u64,
    /// Inclusive end byte.
    end: u64,
}
373
374/// Compute the minimum size to which device `devid` can be shrunk.
375///
376/// Walks the device tree for all `DEV_EXTENT_KEY` items belonging to
377/// `devid`, sums their lengths, then adjusts for extents that sit beyond
378/// the sum by checking whether they can be relocated into holes closer to
379/// the start of the device. The algorithm matches `btrfs inspect-internal
380/// min-dev-size` from btrfs-progs.
381///
382/// Requires `CAP_SYS_ADMIN`.
383pub fn device_min_size(fd: BorrowedFd, devid: u64) -> nix::Result<u64> {
384    let mut min_size: u64 = SZ_1M;
385    let mut extents: Vec<Extent> = Vec::new();
386    let mut holes: Vec<Extent> = Vec::new();
387    let mut last_pos: Option<u64> = None;
388
389    tree_search(
390        fd,
391        SearchKey::for_objectid_range(
392            BTRFS_DEV_TREE_OBJECTID as u64,
393            BTRFS_DEV_EXTENT_KEY,
394            devid,
395            devid,
396        ),
397        |hdr, data| {
398            if data.len()
399                < DEV_EXTENT_LENGTH_OFF + field_size!(btrfs_dev_extent, length)
400            {
401                return Ok(());
402            }
403            let phys_start = hdr.offset;
404            let len = read_le_u64(data, DEV_EXTENT_LENGTH_OFF);
405
406            min_size += len;
407
408            // Extents are prepended (descending end offset) so that the
409            // adjustment pass processes the highest-addressed extent first.
410            extents.push(Extent {
411                start: phys_start,
412                end: phys_start + len - 1,
413            });
414
415            if let Some(prev_end) = last_pos
416                && prev_end != phys_start
417            {
418                holes.push(Extent {
419                    start: prev_end,
420                    end: phys_start - 1,
421                });
422            }
423
424            last_pos = Some(phys_start + len);
425            Ok(())
426        },
427    )?;
428
429    // Sort extents by descending end offset for the adjustment pass.
430    extents.sort_by(|a, b| b.end.cmp(&a.end));
431
432    adjust_min_size(&mut extents, &mut holes, &mut min_size);
433
434    Ok(min_size)
435}
436
437/// Check whether a byte range `[start, end]` contains a superblock mirror.
438fn hole_includes_sb_mirror(start: u64, end: u64) -> bool {
439    (0..BTRFS_SUPER_MIRROR_MAX).any(|i| {
440        let bytenr = sb_offset(i);
441        bytenr >= start && bytenr <= end
442    })
443}
444
445/// Adjust `min_size` downward by relocating tail extents into holes.
446///
447/// Processes extents in descending order of end offset. If an extent sits
448/// beyond the current `min_size`, try to find a hole large enough to
449/// relocate it. If no hole fits, the device cannot be shrunk past that
450/// extent and `min_size` is set to its end + 1.
451///
452/// Adds scratch space (largest relocated extent + 32 MiB for a potential
453/// system chunk allocation) when any relocation is needed.
454fn adjust_min_size(
455    extents: &mut Vec<Extent>,
456    holes: &mut Vec<Extent>,
457    min_size: &mut u64,
458) {
459    let mut scratch_space: u64 = 0;
460
461    while let Some(&ext) = extents.first() {
462        if ext.end < *min_size {
463            break;
464        }
465
466        let extent_len = ext.end - ext.start + 1;
467
468        // Find the first hole large enough to hold this extent.
469        let hole_idx = holes.iter().position(|h| {
470            let hole_len = h.end - h.start + 1;
471            hole_len >= extent_len
472        });
473
474        let Some(idx) = hole_idx else {
475            *min_size = ext.end + 1;
476            break;
477        };
478
479        // If the target hole contains a superblock mirror location,
480        // pessimistically assume we need one more extent worth of space.
481        if hole_includes_sb_mirror(
482            holes[idx].start,
483            holes[idx].start + extent_len - 1,
484        ) {
485            *min_size += extent_len;
486        }
487
488        // Shrink or remove the hole.
489        let hole_len = holes[idx].end - holes[idx].start + 1;
490        if hole_len > extent_len {
491            holes[idx].start += extent_len;
492        } else {
493            holes.remove(idx);
494        }
495
496        extents.remove(0);
497
498        if extent_len > scratch_space {
499            scratch_space = extent_len;
500        }
501    }
502
503    if scratch_space > 0 {
504        *min_size += scratch_space;
505        // Chunk allocation may require a new system chunk (up to 32 MiB).
506        *min_size += SZ_32M;
507    }
508}