Skip to main content

btrfs_uapi/
device.rs

1//! # Device management: adding, removing, querying, and extent layout
2//!
3//! Covers adding and removing devices from a mounted filesystem, scanning a
4//! device to register it with the kernel, querying per-device I/O error
5//! statistics, checking whether all devices of a multi-device filesystem
6//! are present and ready, and computing minimum device sizes from the
7//! device extent tree.
8//!
9//! Most operations require `CAP_SYS_ADMIN`.
10
11use crate::{
12    field_size,
13    filesystem::FilesystemInfo,
14    raw::{
15        BTRFS_DEV_EXTENT_KEY, BTRFS_DEV_STATS_RESET, BTRFS_DEV_TREE_OBJECTID,
16        BTRFS_DEVICE_SPEC_BY_ID, btrfs_dev_extent,
17        btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS,
18        btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS,
19        btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS,
20        btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS,
21        btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX,
22        btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS, btrfs_ioc_add_dev,
23        btrfs_ioc_dev_info, btrfs_ioc_devices_ready, btrfs_ioc_forget_dev,
24        btrfs_ioc_get_dev_stats, btrfs_ioc_rm_dev, btrfs_ioc_rm_dev_v2,
25        btrfs_ioc_scan_dev, btrfs_ioctl_dev_info_args,
26        btrfs_ioctl_get_dev_stats, btrfs_ioctl_vol_args,
27        btrfs_ioctl_vol_args_v2,
28    },
29    tree_search::{SearchKey, tree_search},
30};
31use nix::{errno::Errno, libc::c_char};
32use std::{
33    ffi::CStr,
34    fs::OpenOptions,
35    mem,
36    os::{fd::AsRawFd, unix::io::BorrowedFd},
37};
38use uuid::Uuid;
39
/// Information about a single device within a btrfs filesystem, as returned
/// by `BTRFS_IOC_DEV_INFO`.
///
/// Values are copied out of the raw ioctl result by [`device_info`].
#[derive(Debug, Clone)]
pub struct DeviceInfo {
    /// Device ID.
    pub devid: u64,
    /// Device UUID.
    pub uuid: Uuid,
    /// Number of bytes used on this device.
    pub bytes_used: u64,
    /// Total size of this device in bytes.
    pub total_bytes: u64,
    /// Path to the block device, e.g. `/dev/sda`. The kernel-supplied bytes
    /// are converted lossily, so invalid UTF-8 is replaced rather than
    /// rejected.
    pub path: String,
}
55
/// Specifies a device for operations that can address by either path or ID.
///
/// The `'a` lifetime is that of the borrowed path in the [`DeviceSpec::Path`]
/// variant; the `Id` variant borrows nothing.
#[derive(Debug, Clone)]
pub enum DeviceSpec<'a> {
    /// A block device path (e.g. `/dev/sdb`), or the special strings
    /// `"missing"` or `"cancel"` accepted by the remove ioctl.
    Path(&'a CStr),
    /// A btrfs device ID as reported by `BTRFS_IOC_DEV_INFO`.
    Id(u64),
}
65
/// Per-device I/O error statistics, as returned by `BTRFS_IOC_GET_DEV_STATS`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DeviceStats {
    /// Device ID these stats belong to.
    pub devid: u64,
    /// Number of write I/O errors (EIO/EREMOTEIO from lower layers).
    pub write_errs: u64,
    /// Number of read I/O errors (EIO/EREMOTEIO from lower layers).
    pub read_errs: u64,
    /// Number of flush I/O errors (EIO/EREMOTEIO from lower layers).
    pub flush_errs: u64,
    /// Number of checksum or bytenr corruption errors detected on read.
    pub corruption_errs: u64,
    /// Number of generation errors (blocks not written where expected).
    pub generation_errs: u64,
}

impl DeviceStats {
    /// All error counters, in declaration order (`devid` is not a counter).
    fn counters(&self) -> [u64; 5] {
        [
            self.write_errs,
            self.read_errs,
            self.flush_errs,
            self.corruption_errs,
            self.generation_errs,
        ]
    }

    /// Sum of all error counters.
    ///
    /// The sum saturates at `u64::MAX` instead of overflowing, so it never
    /// panics and never wraps around to a misleadingly small value.
    pub fn total_errs(&self) -> u64 {
        self.counters()
            .iter()
            .fold(0u64, |sum, &count| sum.saturating_add(count))
    }

    /// Returns `true` if every counter is zero.
    ///
    /// Each counter is checked individually rather than via the sum, so the
    /// answer does not depend on how the total is accumulated.
    pub fn is_clean(&self) -> bool {
        self.counters().iter().all(|&count| count == 0)
    }
}
98
// Unit tests for the pure `DeviceStats` helpers; nothing here issues ioctls.
#[cfg(test)]
mod tests {
    use super::*;

    /// A default (all-zero) value reports no errors.
    #[test]
    fn dev_stats_default_is_clean() {
        let stats = DeviceStats::default();
        assert!(stats.is_clean());
        assert_eq!(stats.total_errs(), 0);
    }

    /// The total covers all five counters and excludes `devid`.
    #[test]
    fn dev_stats_total_errs() {
        let stats = DeviceStats {
            devid: 1,
            write_errs: 1,
            read_errs: 2,
            flush_errs: 3,
            corruption_errs: 4,
            generation_errs: 5,
        };
        assert_eq!(stats.total_errs(), 15);
        assert!(!stats.is_clean());
    }

    /// A single non-zero counter is enough to make the stats "not clean".
    #[test]
    fn dev_stats_single_error_not_clean() {
        let stats = DeviceStats {
            corruption_errs: 1,
            ..DeviceStats::default()
        };
        assert!(!stats.is_clean());
        assert_eq!(stats.total_errs(), 1);
    }
}
134
135/// Copy the bytes of `path` (without the nul terminator) into `name`,
136/// returning `ENAMETOOLONG` if the path (including the terminator that the
137/// kernel expects to already be present via zeroing) does not fit.
138fn copy_path_to_name(name: &mut [c_char], path: &CStr) -> nix::Result<()> {
139    let bytes = path.to_bytes(); // excludes nul terminator
140    if bytes.len() >= name.len() {
141        return Err(Errno::ENAMETOOLONG);
142    }
143    for (i, &b) in bytes.iter().enumerate() {
144        name[i] = b as c_char;
145    }
146    // The remainder of `name` is already zeroed by the caller (mem::zeroed).
147    Ok(())
148}
149
150/// Open `/dev/btrfs-control` for read+write, mapping any `std::io::Error` to
151/// the appropriate `nix::errno::Errno`.
152fn open_control() -> nix::Result<std::fs::File> {
153    OpenOptions::new()
154        .read(true)
155        .write(true)
156        .open("/dev/btrfs-control")
157        .map_err(|e| {
158            Errno::from_raw(e.raw_os_error().unwrap_or(nix::libc::ENODEV))
159        })
160}
161
162/// Query information about the device with the given `devid` on the filesystem
163/// referred to by `fd`.
164///
165/// Returns `None` if no device with that ID exists (`ENODEV`).
166pub fn device_info(
167    fd: BorrowedFd,
168    devid: u64,
169) -> nix::Result<Option<DeviceInfo>> {
170    let mut raw: btrfs_ioctl_dev_info_args = unsafe { mem::zeroed() };
171    raw.devid = devid;
172
173    match unsafe { btrfs_ioc_dev_info(fd.as_raw_fd(), &mut raw) } {
174        Err(Errno::ENODEV) => return Ok(None),
175        Err(e) => return Err(e),
176        Ok(_) => {}
177    }
178
179    let path = unsafe { CStr::from_ptr(raw.path.as_ptr() as *const _) }
180        .to_string_lossy()
181        .into_owned();
182
183    Ok(Some(DeviceInfo {
184        devid: raw.devid,
185        uuid: Uuid::from_bytes(raw.uuid),
186        bytes_used: raw.bytes_used,
187        total_bytes: raw.total_bytes,
188        path,
189    }))
190}
191
192/// Query information about all devices in the filesystem referred to by `fd`,
193/// using the device count from a previously obtained [`FilesystemInfo`].
194///
195/// Iterates devids `1..=max_id`, skipping any that return `ENODEV` (holes in
196/// the devid space are normal when devices have been removed).
197pub fn device_info_all(
198    fd: BorrowedFd,
199    fs_info: &FilesystemInfo,
200) -> nix::Result<Vec<DeviceInfo>> {
201    let mut devices = Vec::with_capacity(fs_info.num_devices as usize);
202    for devid in 1..=fs_info.max_id {
203        if let Some(info) = device_info(fd, devid)? {
204            devices.push(info);
205        }
206    }
207    Ok(devices)
208}
209
210/// Add a device to the btrfs filesystem referred to by `fd`.
211///
212/// `path` must be the path to an unmounted block device. The kernel requires
213/// `CAP_SYS_ADMIN`.
214pub fn device_add(fd: BorrowedFd, path: &CStr) -> nix::Result<()> {
215    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
216    copy_path_to_name(&mut raw.name, path)?;
217    unsafe { btrfs_ioc_add_dev(fd.as_raw_fd(), &raw) }?;
218    Ok(())
219}
220
221/// Remove a device from the btrfs filesystem referred to by `fd`.
222///
223/// The device can be specified either by path or by its btrfs device ID via
224/// [`DeviceSpec`]. Uses `BTRFS_IOC_RM_DEV_V2` and falls back to the older
225/// `BTRFS_IOC_RM_DEV` ioctl on kernels that do not support the v2 variant
226/// (only possible when removing by path). The kernel requires `CAP_SYS_ADMIN`.
227pub fn device_remove(fd: BorrowedFd, spec: DeviceSpec) -> nix::Result<()> {
228    let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
229
230    match spec {
231        DeviceSpec::Id(devid) => {
232            args.flags = BTRFS_DEVICE_SPEC_BY_ID as u64;
233            // SAFETY: devid is the active union member when BTRFS_DEVICE_SPEC_BY_ID is set.
234            args.__bindgen_anon_2.devid = devid;
235            unsafe { btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &args) }?;
236        }
237        DeviceSpec::Path(path) => {
238            // SAFETY: name is the active union member when flags == 0.
239            unsafe {
240                copy_path_to_name(&mut args.__bindgen_anon_2.name, path)
241            }?;
242            match unsafe { btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &args) } {
243                Ok(_) => {}
244                // Fall back to the old single-arg ioctl on kernels that either
245                // don't know about v2 (ENOTTY) or don't recognise our flags (EOPNOTSUPP).
246                Err(Errno::ENOTTY) | Err(Errno::EOPNOTSUPP) => {
247                    let mut old: btrfs_ioctl_vol_args =
248                        unsafe { mem::zeroed() };
249                    copy_path_to_name(&mut old.name, path)?;
250                    unsafe { btrfs_ioc_rm_dev(fd.as_raw_fd(), &old) }?;
251                }
252                Err(e) => return Err(e),
253            }
254        }
255    }
256
257    Ok(())
258}
259
260/// Register a block device with the kernel's btrfs device scanner so that
261/// multi-device filesystems containing it can be mounted.
262///
263/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_SCAN_DEV`. `path` must
264/// be the path to a block device that contains a btrfs filesystem member.
265pub fn device_scan(path: &CStr) -> nix::Result<()> {
266    let ctl = open_control()?;
267    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
268    copy_path_to_name(&mut raw.name, path)?;
269    unsafe { btrfs_ioc_scan_dev(ctl.as_raw_fd(), &raw) }?;
270    Ok(())
271}
272
273/// Unregister a device (or all stale devices) from the kernel's btrfs device
274/// scanner.
275///
276/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_FORGET_DEV`. If `path`
277/// is `None`, all devices that are not part of a currently mounted filesystem
278/// are unregistered. If `path` is `Some`, only that specific device path is
279/// unregistered.
280pub fn device_forget(path: Option<&CStr>) -> nix::Result<()> {
281    let ctl = open_control()?;
282    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
283    if let Some(p) = path {
284        copy_path_to_name(&mut raw.name, p)?;
285    }
286    unsafe { btrfs_ioc_forget_dev(ctl.as_raw_fd(), &raw) }?;
287    Ok(())
288}
289
290/// Check whether all member devices of the filesystem that contains `path`
291/// are available and the filesystem is ready to mount.
292///
293/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_DEVICES_READY`. `path`
294/// must be the path to one of the block devices belonging to the filesystem.
295/// Returns `Ok(())` when all devices are present; returns an error (typically
296/// `ENOENT` or `ENXIO`) if the set is incomplete.
297pub fn device_ready(path: &CStr) -> nix::Result<()> {
298    let ctl = open_control()?;
299    // BTRFS_IOC_DEVICES_READY is declared _IOR but the kernel reads the device
300    // path from args.name, so we pass a mut pointer as ioctl_read! requires.
301    let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
302    copy_path_to_name(&mut raw.name, path)?;
303    unsafe { btrfs_ioc_devices_ready(ctl.as_raw_fd(), &mut raw) }?;
304    Ok(())
305}
306
307/// Query I/O error statistics for the device identified by `devid` within the
308/// filesystem referred to by `fd`.
309///
310/// If `reset` is `true`, the kernel atomically returns the current values and
311/// then resets all counters to zero. The kernel requires `CAP_SYS_ADMIN`.
312pub fn device_stats(
313    fd: BorrowedFd,
314    devid: u64,
315    reset: bool,
316) -> nix::Result<DeviceStats> {
317    let mut raw: btrfs_ioctl_get_dev_stats = unsafe { mem::zeroed() };
318    raw.devid = devid;
319    raw.nr_items = btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX as u64;
320    if reset {
321        raw.flags = BTRFS_DEV_STATS_RESET as u64;
322    }
323
324    unsafe { btrfs_ioc_get_dev_stats(fd.as_raw_fd(), &mut raw) }?;
325
326    Ok(DeviceStats {
327        devid,
328        write_errs: raw.values
329            [btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS as usize],
330        read_errs: raw.values
331            [btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS as usize],
332        flush_errs: raw.values
333            [btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS as usize],
334        corruption_errs: raw.values
335            [btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS as usize],
336        generation_errs: raw.values
337            [btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS as usize],
338    })
339}
340
/// Byte offset of the `length` field within `btrfs_dev_extent`, used to read
/// the extent length directly out of raw item data.
const DEV_EXTENT_LENGTH_OFF: usize =
    std::mem::offset_of!(btrfs_dev_extent, length);

/// 1 MiB; the starting value of the minimum-device-size computation.
const SZ_1M: u64 = 1024 * 1024;
/// 32 MiB; allowance added for a potential new system chunk when extents
/// have to be relocated.
const SZ_32M: u64 = 32 * 1024 * 1024;

/// Number of superblock mirror copies btrfs maintains.
const BTRFS_SUPER_MIRROR_MAX: usize = 3;
349
/// Return the byte offset of superblock mirror `i`.
///
/// Mirror 0 (the primary superblock) is at 64 KiB. Every further mirror is
/// located at `16 KiB << (12 * i)`, which puts mirror 1 at 64 MiB and
/// mirror 2 at 256 GiB.
///
/// Only meaningful for the mirror indices btrfs defines (`0..=2`); larger
/// indices would shift past the width of `u64`.
fn sb_offset(i: usize) -> u64 {
    /// Fixed location of the primary superblock.
    const PRIMARY_OFFSET: u64 = 64 * 1024;
    /// Base value that is shifted to obtain the mirror locations.
    const MIRROR_BASE: u64 = 16 * 1024;
    /// Additional shift applied per mirror index.
    const MIRROR_SHIFT: u32 = 12;

    match i {
        0 => PRIMARY_OFFSET,
        _ => MIRROR_BASE << (MIRROR_SHIFT * i as u32),
    }
}
359
/// A contiguous physical byte range on a device (inclusive end).
///
/// Used both for allocated device extents and for the holes between them;
/// the length of a range is `end - start + 1`.
#[derive(Debug, Clone, Copy)]
struct Extent {
    /// First byte of the range.
    start: u64,
    /// Inclusive end byte.
    end: u64,
}
367
368/// Compute the minimum size to which device `devid` can be shrunk.
369///
370/// Walks the device tree for all `DEV_EXTENT_KEY` items belonging to
371/// `devid`, sums their lengths, then adjusts for extents that sit beyond
372/// the sum by checking whether they can be relocated into holes closer to
373/// the start of the device. The algorithm matches `btrfs inspect-internal
374/// min-dev-size` from btrfs-progs.
375///
376/// Requires `CAP_SYS_ADMIN`.
377pub fn device_min_size(fd: BorrowedFd, devid: u64) -> nix::Result<u64> {
378    let mut min_size: u64 = SZ_1M;
379    let mut extents: Vec<Extent> = Vec::new();
380    let mut holes: Vec<Extent> = Vec::new();
381    let mut last_pos: Option<u64> = None;
382
383    tree_search(
384        fd,
385        SearchKey::for_objectid_range(
386            BTRFS_DEV_TREE_OBJECTID as u64,
387            BTRFS_DEV_EXTENT_KEY,
388            devid,
389            devid,
390        ),
391        |hdr, data| {
392            if data.len()
393                < DEV_EXTENT_LENGTH_OFF + field_size!(btrfs_dev_extent, length)
394            {
395                return Ok(());
396            }
397            let phys_start = hdr.offset;
398            let len = read_le_u64(data, DEV_EXTENT_LENGTH_OFF);
399
400            min_size += len;
401
402            // Extents are prepended (descending end offset) so that the
403            // adjustment pass processes the highest-addressed extent first.
404            extents.push(Extent {
405                start: phys_start,
406                end: phys_start + len - 1,
407            });
408
409            if let Some(prev_end) = last_pos {
410                if prev_end != phys_start {
411                    holes.push(Extent {
412                        start: prev_end,
413                        end: phys_start - 1,
414                    });
415                }
416            }
417
418            last_pos = Some(phys_start + len);
419            Ok(())
420        },
421    )?;
422
423    // Sort extents by descending end offset for the adjustment pass.
424    extents.sort_by(|a, b| b.end.cmp(&a.end));
425
426    adjust_min_size(&mut extents, &mut holes, &mut min_size);
427
428    Ok(min_size)
429}
430
431/// Check whether a byte range `[start, end]` contains a superblock mirror.
432fn hole_includes_sb_mirror(start: u64, end: u64) -> bool {
433    (0..BTRFS_SUPER_MIRROR_MAX).any(|i| {
434        let bytenr = sb_offset(i);
435        bytenr >= start && bytenr <= end
436    })
437}
438
439/// Adjust `min_size` downward by relocating tail extents into holes.
440///
441/// Processes extents in descending order of end offset. If an extent sits
442/// beyond the current `min_size`, try to find a hole large enough to
443/// relocate it. If no hole fits, the device cannot be shrunk past that
444/// extent and `min_size` is set to its end + 1.
445///
446/// Adds scratch space (largest relocated extent + 32 MiB for a potential
447/// system chunk allocation) when any relocation is needed.
448fn adjust_min_size(
449    extents: &mut Vec<Extent>,
450    holes: &mut Vec<Extent>,
451    min_size: &mut u64,
452) {
453    let mut scratch_space: u64 = 0;
454
455    while let Some(&ext) = extents.first() {
456        if ext.end < *min_size {
457            break;
458        }
459
460        let extent_len = ext.end - ext.start + 1;
461
462        // Find the first hole large enough to hold this extent.
463        let hole_idx = holes.iter().position(|h| {
464            let hole_len = h.end - h.start + 1;
465            hole_len >= extent_len
466        });
467
468        let Some(idx) = hole_idx else {
469            *min_size = ext.end + 1;
470            break;
471        };
472
473        // If the target hole contains a superblock mirror location,
474        // pessimistically assume we need one more extent worth of space.
475        if hole_includes_sb_mirror(
476            holes[idx].start,
477            holes[idx].start + extent_len - 1,
478        ) {
479            *min_size += extent_len;
480        }
481
482        // Shrink or remove the hole.
483        let hole_len = holes[idx].end - holes[idx].start + 1;
484        if hole_len > extent_len {
485            holes[idx].start += extent_len;
486        } else {
487            holes.remove(idx);
488        }
489
490        extents.remove(0);
491
492        if extent_len > scratch_space {
493            scratch_space = extent_len;
494        }
495    }
496
497    if scratch_space > 0 {
498        *min_size += scratch_space;
499        // Chunk allocation may require a new system chunk (up to 32 MiB).
500        *min_size += SZ_32M;
501    }
502}
503
/// Decode the little-endian `u64` stored in `buf` at byte offset `off`.
///
/// # Panics
///
/// Panics if `buf` does not contain eight bytes starting at `off`.
fn read_le_u64(buf: &[u8], off: usize) -> u64 {
    let mut raw = [0u8; 8];
    raw.copy_from_slice(&buf[off..off + 8]);
    u64::from_le_bytes(raw)
}