Skip to main content

btrfs_uapi/
device.rs

1//! # Device management: adding, removing, querying, and extent layout
2//!
3//! Covers adding and removing devices from a mounted filesystem, scanning a
4//! device to register it with the kernel, querying per-device I/O error
5//! statistics, checking whether all devices of a multi-device filesystem
6//! are present and ready, and computing minimum device sizes from the
7//! device extent tree.
8//!
9//! Most operations require `CAP_SYS_ADMIN`.
10
11use crate::{
12    field_size,
13    filesystem::FilesystemInfo,
14    raw::{
15        BTRFS_DEV_EXTENT_KEY, BTRFS_DEV_STATS_RESET, BTRFS_DEV_TREE_OBJECTID,
16        BTRFS_DEVICE_SPEC_BY_ID, btrfs_dev_extent,
17        btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS,
18        btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS,
19        btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS,
20        btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS,
21        btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX,
22        btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS, btrfs_ioc_add_dev,
23        btrfs_ioc_dev_info, btrfs_ioc_devices_ready, btrfs_ioc_forget_dev,
24        btrfs_ioc_get_dev_stats, btrfs_ioc_rm_dev, btrfs_ioc_rm_dev_v2,
25        btrfs_ioc_scan_dev, btrfs_ioctl_dev_info_args,
26        btrfs_ioctl_get_dev_stats, btrfs_ioctl_vol_args,
27        btrfs_ioctl_vol_args_v2,
28    },
29    tree_search::{SearchKey, tree_search},
30};
31use nix::{errno::Errno, libc::c_char};
32use std::{
33    ffi::CStr,
34    fs::OpenOptions,
35    mem,
36    os::{fd::AsRawFd, unix::io::BorrowedFd},
37};
38use uuid::Uuid;
39
40/// Information about a single device within a btrfs filesystem, as returned
41/// by `BTRFS_IOC_DEV_INFO`.
42#[derive(Debug, Clone)]
43pub struct DeviceInfo {
44    /// Device ID.
45    pub devid: u64,
46    /// Device UUID.
47    pub uuid: Uuid,
48    /// Number of bytes used on this device.
49    pub bytes_used: u64,
50    /// Total size of this device in bytes.
51    pub total_bytes: u64,
52    /// Path to the block device, e.g. `/dev/sda`.
53    pub path: String,
54}
55
/// Selects a device for operations that accept either a path or a device ID.
#[derive(Debug, Clone)]
pub enum DeviceSpec<'a> {
    /// Address the device by block device path (e.g. `/dev/sdb`). The remove
    /// ioctl additionally accepts the special strings `"missing"` and
    /// `"cancel"` here.
    Path(&'a CStr),
    /// Address the device by its btrfs device ID, as reported by
    /// `BTRFS_IOC_DEV_INFO`.
    Id(u64),
}
65
/// Per-device I/O error statistics, as returned by `BTRFS_IOC_GET_DEV_STATS`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DeviceStats {
    /// Device ID these stats belong to.
    pub devid: u64,
    /// Number of write I/O errors (EIO/EREMOTEIO from lower layers).
    pub write_errs: u64,
    /// Number of read I/O errors (EIO/EREMOTEIO from lower layers).
    pub read_errs: u64,
    /// Number of flush I/O errors (EIO/EREMOTEIO from lower layers).
    pub flush_errs: u64,
    /// Number of checksum or bytenr corruption errors detected on read.
    pub corruption_errs: u64,
    /// Number of generation errors (blocks not written where expected).
    pub generation_errs: u64,
}

impl DeviceStats {
    /// The five error counters, excluding `devid`.
    fn counters(&self) -> [u64; 5] {
        [
            self.write_errs,
            self.read_errs,
            self.flush_errs,
            self.corruption_errs,
            self.generation_errs,
        ]
    }

    /// Sum of all error counters.
    ///
    /// The sum saturates at `u64::MAX` instead of overflowing, so extreme
    /// counter values neither panic (debug builds) nor wrap around to a
    /// small number (release builds).
    pub fn total_errs(&self) -> u64 {
        self.counters()
            .iter()
            .fold(0u64, |sum, &count| sum.saturating_add(count))
    }

    /// Returns `true` if every counter is zero.
    ///
    /// Each counter is checked individually rather than testing the sum, so
    /// the answer does not depend on how the total is accumulated.
    pub fn is_clean(&self) -> bool {
        self.counters().iter().all(|&count| count == 0)
    }
}
98
#[cfg(test)]
mod tests {
    use super::*;

    /// A default-constructed value has no errors at all.
    #[test]
    fn dev_stats_default_is_clean() {
        let empty = DeviceStats::default();
        assert_eq!(empty.total_errs(), 0);
        assert!(empty.is_clean());
    }

    /// Every counter contributes to the total.
    #[test]
    fn dev_stats_total_errs() {
        let mixed = DeviceStats {
            devid: 1,
            write_errs: 1,
            read_errs: 2,
            flush_errs: 3,
            corruption_errs: 4,
            generation_errs: 5,
        };
        assert!(!mixed.is_clean());
        assert_eq!(mixed.total_errs(), 15);
    }

    /// One non-zero counter is enough to make the device dirty.
    #[test]
    fn dev_stats_single_error_not_clean() {
        let mut one = DeviceStats::default();
        one.corruption_errs = 1;
        assert_eq!(one.total_errs(), 1);
        assert!(!one.is_clean());
    }
}
134
135/// Copy the bytes of `path` (without the nul terminator) into `name`,
136/// returning `ENAMETOOLONG` if the path (including the terminator that the
137/// kernel expects to already be present via zeroing) does not fit.
138fn copy_path_to_name(name: &mut [c_char], path: &CStr) -> nix::Result<()> {
139    let bytes = path.to_bytes(); // excludes nul terminator
140    if bytes.len() >= name.len() {
141        return Err(Errno::ENAMETOOLONG);
142    }
143    for (i, &b) in bytes.iter().enumerate() {
144        name[i] = b as c_char;
145    }
146    // The remainder of `name` is already zeroed by the caller (mem::zeroed).
147    Ok(())
148}
149
150/// Open `/dev/btrfs-control` for read+write, mapping any `std::io::Error` to
151/// the appropriate `nix::errno::Errno`.
152fn open_control() -> nix::Result<std::fs::File> {
153    OpenOptions::new()
154        .read(true)
155        .write(true)
156        .open("/dev/btrfs-control")
157        .map_err(|e| {
158            Errno::from_raw(e.raw_os_error().unwrap_or(nix::libc::ENODEV))
159        })
160}
161
162/// Query information about the device with the given `devid` on the filesystem
163/// referred to by `fd`.
164///
165/// Returns `None` if no device with that ID exists (`ENODEV`).
166pub fn device_info(
167    fd: BorrowedFd,
168    devid: u64,
169) -> nix::Result<Option<DeviceInfo>> {
170    let mut raw: btrfs_ioctl_dev_info_args = unsafe { mem::zeroed() };
171    raw.devid = devid;
172
173    match unsafe { btrfs_ioc_dev_info(fd.as_raw_fd(), &mut raw) } {
174        Err(Errno::ENODEV) => return Ok(None),
175        Err(e) => return Err(e),
176        Ok(_) => {}
177    }
178
179    let path = unsafe { CStr::from_ptr(raw.path.as_ptr() as *const _) }
180        .to_string_lossy()
181        .into_owned();
182
183    Ok(Some(DeviceInfo {
184        devid: raw.devid,
185        uuid: Uuid::from_bytes(raw.uuid),
186        bytes_used: raw.bytes_used,
187        total_bytes: raw.total_bytes,
188        path,
189    }))
190}
191
192/// Query information about all devices in the filesystem referred to by `fd`,
193/// using the device count from a previously obtained [`FilesystemInfo`].
194///
195/// Iterates devids `1..=max_id`, skipping any that return `ENODEV` (holes in
196/// the devid space are normal when devices have been removed).
197pub fn device_info_all(
198    fd: BorrowedFd,
199    fs_info: &FilesystemInfo,
200) -> nix::Result<Vec<DeviceInfo>> {
201    let mut devices = Vec::with_capacity(fs_info.num_devices as usize);
202    for devid in 1..=fs_info.max_id {
203        if let Some(info) = device_info(fd, devid)? {
204            devices.push(info);
205        }
206    }
207    Ok(devices)
208}
209
210/// Add a device to the btrfs filesystem referred to by `fd`.
211///
212/// `path` must be the path to an unmounted block device. The kernel requires
213/// `CAP_SYS_ADMIN`.
214pub fn device_add(fd: BorrowedFd, path: &CStr) -> nix::Result<()> {
215    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
216    copy_path_to_name(&mut raw.name, path)?;
217    unsafe { btrfs_ioc_add_dev(fd.as_raw_fd(), &raw) }?;
218    Ok(())
219}
220
221/// Remove a device from the btrfs filesystem referred to by `fd`.
222///
223/// The device can be specified either by path or by its btrfs device ID via
224/// [`DeviceSpec`]. Uses `BTRFS_IOC_RM_DEV_V2` and falls back to the older
225/// `BTRFS_IOC_RM_DEV` ioctl on kernels that do not support the v2 variant
226/// (only possible when removing by path). The kernel requires `CAP_SYS_ADMIN`.
227///
228/// Errors: ENOTTY or EOPNOTSUPP from `RM_DEV_V2` triggers an automatic
229/// fallback to the v1 ioctl (path-based removal only; by-ID removal
230/// requires v2 and will propagate the error).  EBUSY if the device holds
231/// the only copy of some data and cannot be removed.
232pub fn device_remove(fd: BorrowedFd, spec: DeviceSpec) -> nix::Result<()> {
233    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
234
235    match spec {
236        DeviceSpec::Id(devid) => {
237            args.flags = BTRFS_DEVICE_SPEC_BY_ID as u64;
238            // SAFETY: devid is the active union member when BTRFS_DEVICE_SPEC_BY_ID is set.
239            args.__bindgen_anon_2.devid = devid;
240            unsafe { btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &args) }?;
241        }
242        DeviceSpec::Path(path) => {
243            // SAFETY: name is the active union member when flags == 0.
244            unsafe {
245                copy_path_to_name(&mut args.__bindgen_anon_2.name, path)
246            }?;
247            match unsafe { btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &args) } {
248                Ok(_) => {}
249                // Fall back to the old single-arg ioctl on kernels that either
250                // don't know about v2 (ENOTTY) or don't recognise our flags (EOPNOTSUPP).
251                Err(Errno::ENOTTY) | Err(Errno::EOPNOTSUPP) => {
252                    let mut old: btrfs_ioctl_vol_args =
253                        unsafe { mem::zeroed() };
254                    copy_path_to_name(&mut old.name, path)?;
255                    unsafe { btrfs_ioc_rm_dev(fd.as_raw_fd(), &old) }?;
256                }
257                Err(e) => return Err(e),
258            }
259        }
260    }
261
262    Ok(())
263}
264
265/// Register a block device with the kernel's btrfs device scanner so that
266/// multi-device filesystems containing it can be mounted.
267///
268/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_SCAN_DEV`. `path` must
269/// be the path to a block device that contains a btrfs filesystem member.
270pub fn device_scan(path: &CStr) -> nix::Result<()> {
271    let ctl = open_control()?;
272    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
273    copy_path_to_name(&mut raw.name, path)?;
274    unsafe { btrfs_ioc_scan_dev(ctl.as_raw_fd(), &raw) }?;
275    Ok(())
276}
277
278/// Unregister a device (or all stale devices) from the kernel's btrfs device
279/// scanner.
280///
281/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_FORGET_DEV`. If `path`
282/// is `None`, all devices that are not part of a currently mounted filesystem
283/// are unregistered. If `path` is `Some`, only that specific device path is
284/// unregistered.
285pub fn device_forget(path: Option<&CStr>) -> nix::Result<()> {
286    let ctl = open_control()?;
287    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
288    if let Some(p) = path {
289        copy_path_to_name(&mut raw.name, p)?;
290    }
291    unsafe { btrfs_ioc_forget_dev(ctl.as_raw_fd(), &raw) }?;
292    Ok(())
293}
294
295/// Check whether all member devices of the filesystem that contains `path`
296/// are available and the filesystem is ready to mount.
297///
298/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_DEVICES_READY`. `path`
299/// must be the path to one of the block devices belonging to the filesystem.
300/// Returns `Ok(())` when all devices are present; returns an error (typically
301/// `ENOENT` or `ENXIO`) if the set is incomplete.
302pub fn device_ready(path: &CStr) -> nix::Result<()> {
303    let ctl = open_control()?;
304    // BTRFS_IOC_DEVICES_READY is declared _IOR but the kernel reads the device
305    // path from args.name, so we pass a mut pointer as ioctl_read! requires.
306    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
307    copy_path_to_name(&mut raw.name, path)?;
308    unsafe { btrfs_ioc_devices_ready(ctl.as_raw_fd(), &mut raw) }?;
309    Ok(())
310}
311
312/// Query I/O error statistics for the device identified by `devid` within the
313/// filesystem referred to by `fd`.
314///
315/// If `reset` is `true`, the kernel atomically returns the current values and
316/// then resets all counters to zero. The kernel requires `CAP_SYS_ADMIN`.
317pub fn device_stats(
318    fd: BorrowedFd,
319    devid: u64,
320    reset: bool,
321) -> nix::Result<DeviceStats> {
322    let mut raw: btrfs_ioctl_get_dev_stats = unsafe { mem::zeroed() };
323    raw.devid = devid;
324    raw.nr_items = btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX as u64;
325    if reset {
326        raw.flags = BTRFS_DEV_STATS_RESET as u64;
327    }
328
329    unsafe { btrfs_ioc_get_dev_stats(fd.as_raw_fd(), &mut raw) }?;
330
331    Ok(DeviceStats {
332        devid,
333        write_errs: raw.values
334            [btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS as usize],
335        read_errs: raw.values
336            [btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS as usize],
337        flush_errs: raw.values
338            [btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS as usize],
339        corruption_errs: raw.values
340            [btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS as usize],
341        generation_errs: raw.values
342            [btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS as usize],
343    })
344}
345
346const DEV_EXTENT_LENGTH_OFF: usize =
347    std::mem::offset_of!(btrfs_dev_extent, length);
348
/// One mebibyte, in bytes.
const SZ_1M: u64 = 1 << 20;
/// Thirty-two mebibytes, in bytes.
const SZ_32M: u64 = 32 * SZ_1M;
351
/// Number of superblock mirror copies btrfs maintains.
const BTRFS_SUPER_MIRROR_MAX: usize = 3;

/// Return the byte offset of superblock mirror `i`.
///
/// Mirror 0 is at 64 KiB, mirror 1 at 64 MiB, mirror 2 at 256 GiB.
///
/// Mirrors other than the primary sit at `16 KiB << (12 * i)`. Only indices
/// below [`BTRFS_SUPER_MIRROR_MAX`] are meaningful.
fn sb_offset(i: usize) -> u64 {
    /// Base value that is shifted to obtain the non-primary mirror offsets.
    const MIRROR_BASE: u64 = 16 * 1024;
    /// Number of bits the base is shifted per mirror index.
    const MIRROR_SHIFT: usize = 12;

    match i {
        // The primary superblock has a fixed location outside the shift scheme.
        0 => 64 * 1024,
        _ => MIRROR_BASE << (MIRROR_SHIFT * i),
    }
}
364
/// Physical byte range on a device, with both bounds inclusive.
#[derive(Debug, Clone, Copy)]
struct Extent {
    /// First byte of the range.
    start: u64,
    /// Last byte of the range (inclusive).
    end: u64,
}
372
373/// Compute the minimum size to which device `devid` can be shrunk.
374///
375/// Walks the device tree for all `DEV_EXTENT_KEY` items belonging to
376/// `devid`, sums their lengths, then adjusts for extents that sit beyond
377/// the sum by checking whether they can be relocated into holes closer to
378/// the start of the device. The algorithm matches `btrfs inspect-internal
379/// min-dev-size` from btrfs-progs.
380///
381/// Requires `CAP_SYS_ADMIN`.
382pub fn device_min_size(fd: BorrowedFd, devid: u64) -> nix::Result<u64> {
383    let mut min_size: u64 = SZ_1M;
384    let mut extents: Vec<Extent> = Vec::new();
385    let mut holes: Vec<Extent> = Vec::new();
386    let mut last_pos: Option<u64> = None;
387
388    tree_search(
389        fd,
390        SearchKey::for_objectid_range(
391            BTRFS_DEV_TREE_OBJECTID as u64,
392            BTRFS_DEV_EXTENT_KEY,
393            devid,
394            devid,
395        ),
396        |hdr, data| {
397            if data.len()
398                < DEV_EXTENT_LENGTH_OFF + field_size!(btrfs_dev_extent, length)
399            {
400                return Ok(());
401            }
402            let phys_start = hdr.offset;
403            let len = read_le_u64(data, DEV_EXTENT_LENGTH_OFF);
404
405            min_size += len;
406
407            // Extents are prepended (descending end offset) so that the
408            // adjustment pass processes the highest-addressed extent first.
409            extents.push(Extent {
410                start: phys_start,
411                end: phys_start + len - 1,
412            });
413
414            if let Some(prev_end) = last_pos
415                && prev_end != phys_start
416            {
417                holes.push(Extent {
418                    start: prev_end,
419                    end: phys_start - 1,
420                });
421            }
422
423            last_pos = Some(phys_start + len);
424            Ok(())
425        },
426    )?;
427
428    // Sort extents by descending end offset for the adjustment pass.
429    extents.sort_by(|a, b| b.end.cmp(&a.end));
430
431    adjust_min_size(&mut extents, &mut holes, &mut min_size);
432
433    Ok(min_size)
434}
435
436/// Check whether a byte range `[start, end]` contains a superblock mirror.
437fn hole_includes_sb_mirror(start: u64, end: u64) -> bool {
438    (0..BTRFS_SUPER_MIRROR_MAX).any(|i| {
439        let bytenr = sb_offset(i);
440        bytenr >= start && bytenr <= end
441    })
442}
443
444/// Adjust `min_size` downward by relocating tail extents into holes.
445///
446/// Processes extents in descending order of end offset. If an extent sits
447/// beyond the current `min_size`, try to find a hole large enough to
448/// relocate it. If no hole fits, the device cannot be shrunk past that
449/// extent and `min_size` is set to its end + 1.
450///
451/// Adds scratch space (largest relocated extent + 32 MiB for a potential
452/// system chunk allocation) when any relocation is needed.
453fn adjust_min_size(
454    extents: &mut Vec<Extent>,
455    holes: &mut Vec<Extent>,
456    min_size: &mut u64,
457) {
458    let mut scratch_space: u64 = 0;
459
460    while let Some(&ext) = extents.first() {
461        if ext.end < *min_size {
462            break;
463        }
464
465        let extent_len = ext.end - ext.start + 1;
466
467        // Find the first hole large enough to hold this extent.
468        let hole_idx = holes.iter().position(|h| {
469            let hole_len = h.end - h.start + 1;
470            hole_len >= extent_len
471        });
472
473        let Some(idx) = hole_idx else {
474            *min_size = ext.end + 1;
475            break;
476        };
477
478        // If the target hole contains a superblock mirror location,
479        // pessimistically assume we need one more extent worth of space.
480        if hole_includes_sb_mirror(
481            holes[idx].start,
482            holes[idx].start + extent_len - 1,
483        ) {
484            *min_size += extent_len;
485        }
486
487        // Shrink or remove the hole.
488        let hole_len = holes[idx].end - holes[idx].start + 1;
489        if hole_len > extent_len {
490            holes[idx].start += extent_len;
491        } else {
492            holes.remove(idx);
493        }
494
495        extents.remove(0);
496
497        if extent_len > scratch_space {
498            scratch_space = extent_len;
499        }
500    }
501
502    if scratch_space > 0 {
503        *min_size += scratch_space;
504        // Chunk allocation may require a new system chunk (up to 32 MiB).
505        *min_size += SZ_32M;
506    }
507}
508
/// Decode the little-endian `u64` stored in `buf[off..off + 8]`.
///
/// Panics if `buf` does not contain eight bytes starting at `off`.
fn read_le_u64(buf: &[u8], off: usize) -> u64 {
    let mut raw = [0u8; 8];
    raw.copy_from_slice(&buf[off..off + 8]);
    u64::from_le_bytes(raw)
}