Skip to main content

btrfs_uapi/
device.rs

1//! # Device management: adding, removing, querying, and extent layout
2//!
3//! Covers adding and removing devices from a mounted filesystem, scanning a
4//! device to register it with the kernel, querying per-device I/O error
5//! statistics, checking whether all devices of a multi-device filesystem
6//! are present and ready, and computing minimum device sizes from the
7//! device extent tree.
8//!
9//! Most operations require `CAP_SYS_ADMIN`.
10
11use crate::{
12    field_size,
13    filesystem::FilesystemInfo,
14    raw::{
15        BTRFS_DEV_EXTENT_KEY, BTRFS_DEV_STATS_RESET, BTRFS_DEV_TREE_OBJECTID,
16        BTRFS_DEVICE_SPEC_BY_ID, btrfs_dev_extent,
17        btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS,
18        btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS,
19        btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS,
20        btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS,
21        btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX,
22        btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS, btrfs_ioc_add_dev,
23        btrfs_ioc_dev_info, btrfs_ioc_devices_ready, btrfs_ioc_forget_dev,
24        btrfs_ioc_get_dev_stats, btrfs_ioc_rm_dev, btrfs_ioc_rm_dev_v2,
25        btrfs_ioc_scan_dev, btrfs_ioctl_dev_info_args,
26        btrfs_ioctl_get_dev_stats, btrfs_ioctl_vol_args,
27        btrfs_ioctl_vol_args_v2,
28    },
29    tree_search::{SearchKey, tree_search},
30    util::read_le_u64,
31};
32use nix::{errno::Errno, libc::c_char};
33use std::{
34    ffi::CStr,
35    fs::OpenOptions,
36    mem,
37    os::{fd::AsRawFd, unix::io::BorrowedFd},
38};
39use uuid::Uuid;
40
41/// Information about a single device within a btrfs filesystem, as returned
42/// by `BTRFS_IOC_DEV_INFO`.
43#[derive(Debug, Clone)]
44pub struct DeviceInfo {
45    /// Device ID.
46    pub devid: u64,
47    /// Device UUID.
48    pub uuid: Uuid,
49    /// Number of bytes used on this device.
50    pub bytes_used: u64,
51    /// Total size of this device in bytes.
52    pub total_bytes: u64,
53    /// Path to the block device, e.g. `/dev/sda`.
54    pub path: String,
55}
56
/// Selects a device for operations that accept either a path or a device ID.
#[derive(Debug, Clone)]
pub enum DeviceSpec<'a> {
    /// Address the device by block device path (e.g. `/dev/sdb`). The
    /// remove ioctl additionally accepts the special strings `"missing"`
    /// and `"cancel"` here.
    Path(&'a CStr),
    /// Address the device by its btrfs device ID, as reported by
    /// `BTRFS_IOC_DEV_INFO`.
    Id(u64),
}
66
/// Per-device I/O error statistics, as returned by `BTRFS_IOC_GET_DEV_STATS`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DeviceStats {
    /// Device ID these stats belong to.
    pub devid: u64,
    /// Number of write I/O errors (EIO/EREMOTEIO from lower layers).
    pub write_errs: u64,
    /// Number of read I/O errors (EIO/EREMOTEIO from lower layers).
    pub read_errs: u64,
    /// Number of flush I/O errors (EIO/EREMOTEIO from lower layers).
    pub flush_errs: u64,
    /// Number of checksum or bytenr corruption errors detected on read.
    pub corruption_errs: u64,
    /// Number of generation errors (blocks not written where expected).
    pub generation_errs: u64,
}

impl DeviceStats {
    /// The five error counters in declaration order (`devid` is not a
    /// counter and is excluded).
    fn counters(&self) -> [u64; 5] {
        [
            self.write_errs,
            self.read_errs,
            self.flush_errs,
            self.corruption_errs,
            self.generation_errs,
        ]
    }

    /// Sum of all error counters.
    ///
    /// The sum saturates at `u64::MAX` instead of overflowing, so extreme
    /// counter values can neither panic (debug builds) nor wrap around to a
    /// small number (release builds).
    #[must_use]
    pub fn total_errs(&self) -> u64 {
        self.counters()
            .iter()
            .fold(0u64, |sum, &count| sum.saturating_add(count))
    }

    /// Returns `true` if every counter is zero.
    ///
    /// Each counter is inspected individually so the answer never depends
    /// on the arithmetic of the sum.
    #[must_use]
    pub fn is_clean(&self) -> bool {
        self.counters().iter().all(|&count| count == 0)
    }
}
101
#[cfg(test)]
mod tests {
    use super::*;

    /// A default-constructed value has every counter at zero.
    #[test]
    fn dev_stats_default_is_clean() {
        let zeroed = DeviceStats::default();
        assert_eq!(zeroed.total_errs(), 0);
        assert!(zeroed.is_clean());
    }

    /// The total is the plain sum of the five counters.
    #[test]
    fn dev_stats_total_errs() {
        let populated = DeviceStats {
            devid: 1,
            write_errs: 1,
            read_errs: 2,
            flush_errs: 3,
            corruption_errs: 4,
            generation_errs: 5,
        };
        assert!(!populated.is_clean());
        assert_eq!(populated.total_errs(), 15);
    }

    /// One non-zero counter is enough to make the stats dirty.
    #[test]
    fn dev_stats_single_error_not_clean() {
        let mut one_error = DeviceStats::default();
        one_error.corruption_errs = 1;
        assert_eq!(one_error.total_errs(), 1);
        assert!(!one_error.is_clean());
    }
}
137
138/// Copy the bytes of `path` (without the nul terminator) into `name`,
139/// returning `ENAMETOOLONG` if the path (including the terminator that the
140/// kernel expects to already be present via zeroing) does not fit.
141fn copy_path_to_name(name: &mut [c_char], path: &CStr) -> nix::Result<()> {
142    let bytes = path.to_bytes(); // excludes nul terminator
143    if bytes.len() >= name.len() {
144        return Err(Errno::ENAMETOOLONG);
145    }
146    for (i, &b) in bytes.iter().enumerate() {
147        name[i] = b as c_char;
148    }
149    // The remainder of `name` is already zeroed by the caller (mem::zeroed).
150    Ok(())
151}
152
153/// Open `/dev/btrfs-control` for read+write, mapping any `std::io::Error` to
154/// the appropriate `nix::errno::Errno`.
155fn open_control() -> nix::Result<std::fs::File> {
156    OpenOptions::new()
157        .read(true)
158        .write(true)
159        .open("/dev/btrfs-control")
160        .map_err(|e| {
161            Errno::from_raw(e.raw_os_error().unwrap_or(nix::libc::ENODEV))
162        })
163}
164
165/// Query information about the device with the given `devid` on the filesystem
166/// referred to by `fd`.
167///
168/// Returns `None` if no device with that ID exists (`ENODEV`).
169pub fn device_info(
170    fd: BorrowedFd,
171    devid: u64,
172) -> nix::Result<Option<DeviceInfo>> {
173    let mut raw: btrfs_ioctl_dev_info_args = unsafe { mem::zeroed() };
174    raw.devid = devid;
175
176    match unsafe { btrfs_ioc_dev_info(fd.as_raw_fd(), &raw mut raw) } {
177        Err(Errno::ENODEV) => return Ok(None),
178        Err(e) => return Err(e),
179        Ok(_) => {}
180    }
181
182    let path = unsafe { CStr::from_ptr(raw.path.as_ptr().cast()) }
183        .to_string_lossy()
184        .into_owned();
185
186    Ok(Some(DeviceInfo {
187        devid: raw.devid,
188        uuid: Uuid::from_bytes(raw.uuid),
189        bytes_used: raw.bytes_used,
190        total_bytes: raw.total_bytes,
191        path,
192    }))
193}
194
195/// Query information about all devices in the filesystem referred to by `fd`,
196/// using the device count from a previously obtained [`FilesystemInfo`].
197///
198/// Iterates devids `1..=max_id`, skipping any that return `ENODEV` (holes in
199/// the devid space are normal when devices have been removed).
200pub fn device_info_all(
201    fd: BorrowedFd,
202    fs_info: &FilesystemInfo,
203) -> nix::Result<Vec<DeviceInfo>> {
204    let mut devices = Vec::with_capacity(fs_info.num_devices as usize);
205    for devid in 1..=fs_info.max_id {
206        if let Some(info) = device_info(fd, devid)? {
207            devices.push(info);
208        }
209    }
210    Ok(devices)
211}
212
213/// Add a device to the btrfs filesystem referred to by `fd`.
214///
215/// `path` must be the path to an unmounted block device. The kernel requires
216/// `CAP_SYS_ADMIN`.
217pub fn device_add(fd: BorrowedFd, path: &CStr) -> nix::Result<()> {
218    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
219    copy_path_to_name(&mut raw.name, path)?;
220    unsafe { btrfs_ioc_add_dev(fd.as_raw_fd(), &raw const raw) }?;
221    Ok(())
222}
223
224/// Remove a device from the btrfs filesystem referred to by `fd`.
225///
226/// The device can be specified either by path or by its btrfs device ID via
227/// [`DeviceSpec`]. Uses `BTRFS_IOC_RM_DEV_V2` and falls back to the older
228/// `BTRFS_IOC_RM_DEV` ioctl on kernels that do not support the v2 variant
229/// (only possible when removing by path). The kernel requires `CAP_SYS_ADMIN`.
230///
231/// Errors: ENOTTY or EOPNOTSUPP from `RM_DEV_V2` triggers an automatic
232/// fallback to the v1 ioctl (path-based removal only; by-ID removal
233/// requires v2 and will propagate the error).  EBUSY if the device holds
234/// the only copy of some data and cannot be removed.
235pub fn device_remove(fd: BorrowedFd, spec: DeviceSpec) -> nix::Result<()> {
236    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
237
238    match spec {
239        DeviceSpec::Id(devid) => {
240            args.flags = u64::from(BTRFS_DEVICE_SPEC_BY_ID);
241            // SAFETY: devid is the active union member when BTRFS_DEVICE_SPEC_BY_ID is set.
242            args.__bindgen_anon_2.devid = devid;
243            unsafe { btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &raw const args) }?;
244        }
245        DeviceSpec::Path(path) => {
246            // SAFETY: name is the active union member when flags == 0.
247            unsafe {
248                copy_path_to_name(&mut args.__bindgen_anon_2.name, path)
249            }?;
250            match unsafe {
251                btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &raw const args)
252            } {
253                Ok(_) => {}
254                // Fall back to the old single-arg ioctl on kernels that either
255                // don't know about v2 (ENOTTY) or don't recognise our flags (EOPNOTSUPP).
256                Err(Errno::ENOTTY | Errno::EOPNOTSUPP) => {
257                    let mut old: btrfs_ioctl_vol_args =
258                        unsafe { mem::zeroed() };
259                    copy_path_to_name(&mut old.name, path)?;
260                    unsafe {
261                        btrfs_ioc_rm_dev(fd.as_raw_fd(), &raw const old)
262                    }?;
263                }
264                Err(e) => return Err(e),
265            }
266        }
267    }
268
269    Ok(())
270}
271
272/// Register a block device with the kernel's btrfs device scanner so that
273/// multi-device filesystems containing it can be mounted.
274///
275/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_SCAN_DEV`. `path` must
276/// be the path to a block device that contains a btrfs filesystem member.
277pub fn device_scan(path: &CStr) -> nix::Result<()> {
278    let ctl = open_control()?;
279    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
280    copy_path_to_name(&mut raw.name, path)?;
281    unsafe { btrfs_ioc_scan_dev(ctl.as_raw_fd(), &raw const raw) }?;
282    Ok(())
283}
284
285/// Unregister a device (or all stale devices) from the kernel's btrfs device
286/// scanner.
287///
288/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_FORGET_DEV`. If `path`
289/// is `None`, all devices that are not part of a currently mounted filesystem
290/// are unregistered. If `path` is `Some`, only that specific device path is
291/// unregistered.
292pub fn device_forget(path: Option<&CStr>) -> nix::Result<()> {
293    let ctl = open_control()?;
294    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
295    if let Some(p) = path {
296        copy_path_to_name(&mut raw.name, p)?;
297    }
298    unsafe { btrfs_ioc_forget_dev(ctl.as_raw_fd(), &raw const raw) }?;
299    Ok(())
300}
301
302/// Check whether all member devices of the filesystem that contains `path`
303/// are available and the filesystem is ready to mount.
304///
305/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_DEVICES_READY`. `path`
306/// must be the path to one of the block devices belonging to the filesystem.
307/// Returns `Ok(())` when all devices are present; returns an error (typically
308/// `ENOENT` or `ENXIO`) if the set is incomplete.
309pub fn device_ready(path: &CStr) -> nix::Result<()> {
310    let ctl = open_control()?;
311    // BTRFS_IOC_DEVICES_READY is declared _IOR but the kernel reads the device
312    // path from args.name, so we pass a mut pointer as ioctl_read! requires.
313    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
314    copy_path_to_name(&mut raw.name, path)?;
315    unsafe { btrfs_ioc_devices_ready(ctl.as_raw_fd(), &raw mut raw) }?;
316    Ok(())
317}
318
319/// Query I/O error statistics for the device identified by `devid` within the
320/// filesystem referred to by `fd`.
321///
322/// If `reset` is `true`, the kernel atomically returns the current values and
323/// then resets all counters to zero. The kernel requires `CAP_SYS_ADMIN`.
324pub fn device_stats(
325    fd: BorrowedFd,
326    devid: u64,
327    reset: bool,
328) -> nix::Result<DeviceStats> {
329    let mut raw: btrfs_ioctl_get_dev_stats = unsafe { mem::zeroed() };
330    raw.devid = devid;
331    raw.nr_items = u64::from(btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX);
332    if reset {
333        raw.flags = u64::from(BTRFS_DEV_STATS_RESET);
334    }
335
336    unsafe { btrfs_ioc_get_dev_stats(fd.as_raw_fd(), &raw mut raw) }?;
337
338    Ok(DeviceStats {
339        devid,
340        write_errs: raw.values
341            [btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS as usize],
342        read_errs: raw.values
343            [btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS as usize],
344        flush_errs: raw.values
345            [btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS as usize],
346        corruption_errs: raw.values
347            [btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS as usize],
348        generation_errs: raw.values
349            [btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS as usize],
350    })
351}
352
353const DEV_EXTENT_LENGTH_OFF: usize =
354    std::mem::offset_of!(btrfs_dev_extent, length);
355
/// One mebibyte in bytes.
const SZ_1M: u64 = 1 << 20;
/// Thirty-two mebibytes in bytes.
const SZ_32M: u64 = 32 * SZ_1M;

/// Number of superblock mirror copies btrfs maintains.
const BTRFS_SUPER_MIRROR_MAX: usize = 3;
361
/// Return the byte offset of superblock mirror `i`.
///
/// Mirror 0 is at 64 KiB, mirror 1 at 64 MiB, mirror 2 at 256 GiB. Mirrors
/// after the primary sit at `16 KiB << (12 * i)`, the same rule btrfs uses
/// on disk.
///
/// Only meaningful for `i < 3`; for `i >= 5` the shifted value no longer
/// fits in a `u64`.
fn sb_offset(i: usize) -> u64 {
    /// Offset of the primary superblock.
    const PRIMARY_OFFSET: u64 = 64 * 1024;
    /// Base value that is shifted to obtain the mirror offsets.
    const MIRROR_BASE: u64 = 16 * 1024;
    /// Each further mirror is placed 2^12 times further out.
    const MIRROR_SHIFT: u64 = 12;

    match i {
        0 => PRIMARY_OFFSET,
        _ => MIRROR_BASE << (MIRROR_SHIFT * (i as u64)),
    }
}
371
/// Physical byte range `[start, end]` on a device; both bounds are
/// inclusive. Used for allocated device extents and for the holes between
/// them.
#[derive(Debug, Clone, Copy)]
struct Extent {
    /// First byte of the range.
    start: u64,
    /// Last byte of the range (inclusive).
    end: u64,
}
379
380/// Compute the minimum size to which device `devid` can be shrunk.
381///
382/// Walks the device tree for all `DEV_EXTENT_KEY` items belonging to
383/// `devid`, sums their lengths, then adjusts for extents that sit beyond
384/// the sum by checking whether they can be relocated into holes closer to
385/// the start of the device. The algorithm matches `btrfs inspect-internal
386/// min-dev-size` from btrfs-progs.
387///
388/// Requires `CAP_SYS_ADMIN`.
389pub fn device_min_size(fd: BorrowedFd, devid: u64) -> nix::Result<u64> {
390    let mut min_size: u64 = SZ_1M;
391    let mut extents: Vec<Extent> = Vec::new();
392    let mut holes: Vec<Extent> = Vec::new();
393    let mut last_pos: Option<u64> = None;
394
395    tree_search(
396        fd,
397        SearchKey::for_objectid_range(
398            u64::from(BTRFS_DEV_TREE_OBJECTID),
399            BTRFS_DEV_EXTENT_KEY,
400            devid,
401            devid,
402        ),
403        |hdr, data| {
404            if data.len()
405                < DEV_EXTENT_LENGTH_OFF + field_size!(btrfs_dev_extent, length)
406            {
407                return Ok(());
408            }
409            let phys_start = hdr.offset;
410            let len = read_le_u64(data, DEV_EXTENT_LENGTH_OFF);
411
412            min_size += len;
413
414            // Extents are prepended (descending end offset) so that the
415            // adjustment pass processes the highest-addressed extent first.
416            extents.push(Extent {
417                start: phys_start,
418                end: phys_start + len - 1,
419            });
420
421            if let Some(prev_end) = last_pos
422                && prev_end != phys_start
423            {
424                holes.push(Extent {
425                    start: prev_end,
426                    end: phys_start - 1,
427                });
428            }
429
430            last_pos = Some(phys_start + len);
431            Ok(())
432        },
433    )?;
434
435    // Sort extents by descending end offset for the adjustment pass.
436    extents.sort_by(|a, b| b.end.cmp(&a.end));
437
438    adjust_min_size(&mut extents, &mut holes, &mut min_size);
439
440    Ok(min_size)
441}
442
443/// Check whether a byte range `[start, end]` contains a superblock mirror.
444fn hole_includes_sb_mirror(start: u64, end: u64) -> bool {
445    (0..BTRFS_SUPER_MIRROR_MAX).any(|i| {
446        let bytenr = sb_offset(i);
447        bytenr >= start && bytenr <= end
448    })
449}
450
451/// Adjust `min_size` downward by relocating tail extents into holes.
452///
453/// Processes extents in descending order of end offset. If an extent sits
454/// beyond the current `min_size`, try to find a hole large enough to
455/// relocate it. If no hole fits, the device cannot be shrunk past that
456/// extent and `min_size` is set to its end + 1.
457///
458/// Adds scratch space (largest relocated extent + 32 MiB for a potential
459/// system chunk allocation) when any relocation is needed.
460fn adjust_min_size(
461    extents: &mut Vec<Extent>,
462    holes: &mut Vec<Extent>,
463    min_size: &mut u64,
464) {
465    let mut scratch_space: u64 = 0;
466
467    while let Some(&ext) = extents.first() {
468        if ext.end < *min_size {
469            break;
470        }
471
472        let extent_len = ext.end - ext.start + 1;
473
474        // Find the first hole large enough to hold this extent.
475        let hole_idx = holes.iter().position(|h| {
476            let hole_len = h.end - h.start + 1;
477            hole_len >= extent_len
478        });
479
480        let Some(idx) = hole_idx else {
481            *min_size = ext.end + 1;
482            break;
483        };
484
485        // If the target hole contains a superblock mirror location,
486        // pessimistically assume we need one more extent worth of space.
487        if hole_includes_sb_mirror(
488            holes[idx].start,
489            holes[idx].start + extent_len - 1,
490        ) {
491            *min_size += extent_len;
492        }
493
494        // Shrink or remove the hole.
495        let hole_len = holes[idx].end - holes[idx].start + 1;
496        if hole_len > extent_len {
497            holes[idx].start += extent_len;
498        } else {
499            holes.remove(idx);
500        }
501
502        extents.remove(0);
503
504        if extent_len > scratch_space {
505            scratch_space = extent_len;
506        }
507    }
508
509    if scratch_space > 0 {
510        *min_size += scratch_space;
511        // Chunk allocation may require a new system chunk (up to 32 MiB).
512        *min_size += SZ_32M;
513    }
514}