btrfs_uapi/device.rs
1//! # Device management: adding, removing, querying, and extent layout
2//!
3//! Covers adding and removing devices from a mounted filesystem, scanning a
4//! device to register it with the kernel, querying per-device I/O error
5//! statistics, checking whether all devices of a multi-device filesystem
6//! are present and ready, and computing minimum device sizes from the
7//! device extent tree.
8//!
9//! Most operations require `CAP_SYS_ADMIN`.
10
11use crate::{
12 filesystem::FilesystemInfo,
13 raw::{
14 BTRFS_DEV_EXTENT_KEY, BTRFS_DEV_STATS_RESET, BTRFS_DEV_TREE_OBJECTID,
15 BTRFS_DEVICE_SPEC_BY_ID,
16 btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS,
17 btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS,
18 btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS,
19 btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS,
20 btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX,
21 btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS, btrfs_ioc_add_dev,
22 btrfs_ioc_dev_info, btrfs_ioc_devices_ready, btrfs_ioc_forget_dev,
23 btrfs_ioc_get_dev_stats, btrfs_ioc_rm_dev, btrfs_ioc_rm_dev_v2,
24 btrfs_ioc_scan_dev, btrfs_ioctl_dev_info_args,
25 btrfs_ioctl_get_dev_stats, btrfs_ioctl_vol_args,
26 btrfs_ioctl_vol_args_v2,
27 },
28 tree_search::{SearchFilter, tree_search},
29};
30use nix::{errno::Errno, libc::c_char};
31use std::{
32 ffi::CStr,
33 fs::OpenOptions,
34 mem,
35 os::{fd::AsRawFd, unix::io::BorrowedFd},
36};
37use uuid::Uuid;
38
/// Information about a single device within a btrfs filesystem, as returned
/// by `BTRFS_IOC_DEV_INFO`.
///
/// Instances are produced by [`device_info`], which copies each field out of
/// the raw ioctl argument structure after a successful call.
#[derive(Debug, Clone)]
pub struct DeviceInfo {
    /// Device ID.
    pub devid: u64,
    /// Device UUID, built from the raw UUID bytes reported by the ioctl.
    pub uuid: Uuid,
    /// Number of bytes used on this device.
    pub bytes_used: u64,
    /// Total size of this device in bytes.
    pub total_bytes: u64,
    /// Path to the block device, e.g. `/dev/sda`.
    ///
    /// The raw path is converted lossily, so any bytes that are not valid
    /// UTF-8 appear as the Unicode replacement character.
    pub path: String,
}
54
/// Specifies a device for operations that can address by either path or ID.
///
/// Consumed by [`device_remove`]: the `Path` form can fall back to the legacy
/// remove ioctl, while the `Id` form is only sent through the v2 ioctl.
#[derive(Debug, Clone)]
pub enum DeviceSpec<'a> {
    /// A block device path (e.g. `/dev/sdb`), or the special strings
    /// `"missing"` or `"cancel"` accepted by the remove ioctl.
    Path(&'a CStr),
    /// A btrfs device ID as reported by `BTRFS_IOC_DEV_INFO`.
    Id(u64),
}
64
/// Per-device I/O error statistics, as returned by `BTRFS_IOC_GET_DEV_STATS`.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct DeviceStats {
    /// Device ID these stats belong to.
    pub devid: u64,
    /// Number of write I/O errors (EIO/EREMOTEIO from lower layers).
    pub write_errs: u64,
    /// Number of read I/O errors (EIO/EREMOTEIO from lower layers).
    pub read_errs: u64,
    /// Number of flush I/O errors (EIO/EREMOTEIO from lower layers).
    pub flush_errs: u64,
    /// Number of checksum or bytenr corruption errors detected on read.
    pub corruption_errs: u64,
    /// Number of generation errors (blocks not written where expected).
    pub generation_errs: u64,
}

impl DeviceStats {
    /// The five error counters in declaration order (`devid` is not a counter).
    fn counters(&self) -> [u64; 5] {
        [
            self.write_errs,
            self.read_errs,
            self.flush_errs,
            self.corruption_errs,
            self.generation_errs,
        ]
    }

    /// Sum of all error counters, saturating at `u64::MAX`.
    ///
    /// Saturating arithmetic is used so that extreme counter values can
    /// neither panic (debug builds) nor wrap around to a small number
    /// (release builds).
    #[must_use]
    pub fn total_errs(&self) -> u64 {
        self.counters()
            .iter()
            .fold(0u64, |acc, &count| acc.saturating_add(count))
    }

    /// Returns `true` if every counter is zero.
    ///
    /// Each counter is inspected individually rather than going through the
    /// sum, so the answer cannot be skewed by arithmetic overflow.
    #[must_use]
    pub fn is_clean(&self) -> bool {
        self.counters().iter().all(|&count| count == 0)
    }
}
99
#[cfg(test)]
mod tests {
    use super::*;

    /// A default-constructed value has every counter at zero.
    #[test]
    fn dev_stats_default_is_clean() {
        let zeroed = DeviceStats::default();
        assert_eq!(zeroed.total_errs(), 0);
        assert!(zeroed.is_clean());
    }

    /// All five counters contribute to the total; `devid` does not.
    #[test]
    fn dev_stats_total_errs() {
        let mixed = DeviceStats {
            devid: 1,
            write_errs: 1,
            read_errs: 2,
            flush_errs: 3,
            corruption_errs: 4,
            generation_errs: 5,
        };
        assert!(!mixed.is_clean());
        assert_eq!(mixed.total_errs(), 15);
    }

    /// A single non-zero counter is enough to make the stats dirty.
    #[test]
    fn dev_stats_single_error_not_clean() {
        let only_corruption = DeviceStats {
            corruption_errs: 1,
            ..DeviceStats::default()
        };
        assert_eq!(only_corruption.total_errs(), 1);
        assert!(!only_corruption.is_clean());
    }
}
135
136/// Copy the bytes of `path` (without the nul terminator) into `name`,
137/// returning `ENAMETOOLONG` if the path (including the terminator that the
138/// kernel expects to already be present via zeroing) does not fit.
139#[allow(clippy::cast_possible_wrap)] // ASCII bytes always fit in c_char
140fn copy_path_to_name(name: &mut [c_char], path: &CStr) -> nix::Result<()> {
141 let bytes = path.to_bytes(); // excludes nul terminator
142 if bytes.len() >= name.len() {
143 return Err(Errno::ENAMETOOLONG);
144 }
145 for (i, &b) in bytes.iter().enumerate() {
146 name[i] = b as c_char;
147 }
148 // The remainder of `name` is already zeroed by the caller (mem::zeroed).
149 Ok(())
150}
151
152/// Open `/dev/btrfs-control` for read+write, mapping any `std::io::Error` to
153/// the appropriate `nix::errno::Errno`.
154fn open_control() -> nix::Result<std::fs::File> {
155 OpenOptions::new()
156 .read(true)
157 .write(true)
158 .open("/dev/btrfs-control")
159 .map_err(|e| {
160 Errno::from_raw(e.raw_os_error().unwrap_or(nix::libc::ENODEV))
161 })
162}
163
164/// Query information about the device with the given `devid` on the filesystem
165/// referred to by `fd`.
166///
167/// Returns `None` if no device with that ID exists (`ENODEV`).
168///
169/// # Errors
170///
171/// Returns `Err` if the ioctl fails (other than `ENODEV`).
172pub fn device_info(
173 fd: BorrowedFd,
174 devid: u64,
175) -> nix::Result<Option<DeviceInfo>> {
176 let mut raw: btrfs_ioctl_dev_info_args = unsafe { mem::zeroed() };
177 raw.devid = devid;
178
179 match unsafe { btrfs_ioc_dev_info(fd.as_raw_fd(), &raw mut raw) } {
180 Err(Errno::ENODEV) => return Ok(None),
181 Err(e) => return Err(e),
182 Ok(_) => {}
183 }
184
185 let path = unsafe { CStr::from_ptr(raw.path.as_ptr().cast()) }
186 .to_string_lossy()
187 .into_owned();
188
189 Ok(Some(DeviceInfo {
190 devid: raw.devid,
191 uuid: Uuid::from_bytes(raw.uuid),
192 bytes_used: raw.bytes_used,
193 total_bytes: raw.total_bytes,
194 path,
195 }))
196}
197
198/// Query information about all devices in the filesystem referred to by `fd`,
199/// using the device count from a previously obtained [`FilesystemInfo`].
200///
201/// Iterates devids `1..=max_id`, skipping any that return `ENODEV` (holes in
202/// the devid space are normal when devices have been removed).
203///
204/// # Errors
205///
206/// Returns `Err` if any device info ioctl fails.
207pub fn device_info_all(
208 fd: BorrowedFd,
209 fs_info: &FilesystemInfo,
210) -> nix::Result<Vec<DeviceInfo>> {
211 #[allow(clippy::cast_possible_truncation)]
212 // device count always fits in usize
213 let mut devices = Vec::with_capacity(fs_info.num_devices as usize);
214 for devid in 1..=fs_info.max_id {
215 if let Some(info) = device_info(fd, devid)? {
216 devices.push(info);
217 }
218 }
219 Ok(devices)
220}
221
222/// Add a device to the btrfs filesystem referred to by `fd`.
223///
224/// `path` must be the path to an unmounted block device. The kernel requires
225/// `CAP_SYS_ADMIN`.
226///
227/// # Errors
228///
229/// Returns `Err` if the ioctl fails.
230pub fn device_add(fd: BorrowedFd, path: &CStr) -> nix::Result<()> {
231 let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
232 copy_path_to_name(&mut raw.name, path)?;
233 unsafe { btrfs_ioc_add_dev(fd.as_raw_fd(), &raw const raw) }?;
234 Ok(())
235}
236
237/// Remove a device from the btrfs filesystem referred to by `fd`.
238///
239/// The device can be specified either by path or by its btrfs device ID via
240/// [`DeviceSpec`]. Uses `BTRFS_IOC_RM_DEV_V2` and falls back to the older
241/// `BTRFS_IOC_RM_DEV` ioctl on kernels that do not support the v2 variant
242/// (only possible when removing by path). The kernel requires `CAP_SYS_ADMIN`.
243///
244/// Errors: ENOTTY or EOPNOTSUPP from `RM_DEV_V2` triggers an automatic
245/// fallback to the v1 ioctl (path-based removal only; by-ID removal
246/// requires v2 and will propagate the error). `EBUSY` if the device holds
247/// the only copy of some data and cannot be removed.
248///
249/// # Errors
250///
251/// Returns `Err` if the remove ioctl fails.
252pub fn device_remove(fd: BorrowedFd, spec: &DeviceSpec<'_>) -> nix::Result<()> {
253 let mut args: btrfs_ioctl_vol_args_v2 = unsafe { mem::zeroed() };
254
255 match *spec {
256 DeviceSpec::Id(devid) => {
257 args.flags = u64::from(BTRFS_DEVICE_SPEC_BY_ID);
258 // SAFETY: devid is the active union member when BTRFS_DEVICE_SPEC_BY_ID is set.
259 args.__bindgen_anon_2.devid = devid;
260 unsafe { btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &raw const args) }?;
261 }
262 DeviceSpec::Path(path) => {
263 // SAFETY: name is the active union member when flags == 0.
264 unsafe {
265 copy_path_to_name(&mut args.__bindgen_anon_2.name, path)
266 }?;
267 match unsafe {
268 btrfs_ioc_rm_dev_v2(fd.as_raw_fd(), &raw const args)
269 } {
270 Ok(_) => {}
271 // Fall back to the old single-arg ioctl on kernels that either
272 // don't know about v2 (ENOTTY) or don't recognise our flags (EOPNOTSUPP).
273 Err(Errno::ENOTTY | Errno::EOPNOTSUPP) => {
274 let mut old: btrfs_ioctl_vol_args =
275 unsafe { mem::zeroed() };
276 copy_path_to_name(&mut old.name, path)?;
277 unsafe {
278 btrfs_ioc_rm_dev(fd.as_raw_fd(), &raw const old)
279 }?;
280 }
281 Err(e) => return Err(e),
282 }
283 }
284 }
285
286 Ok(())
287}
288
289/// Register a block device with the kernel's btrfs device scanner so that
290/// multi-device filesystems containing it can be mounted.
291///
292/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_SCAN_DEV`. `path` must
293/// be the path to a block device that contains a btrfs filesystem member.
294///
295/// # Errors
296///
297/// Returns `Err` if opening `/dev/btrfs-control` or the ioctl fails.
298pub fn device_scan(path: &CStr) -> nix::Result<()> {
299 let ctl = open_control()?;
300 let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
301 copy_path_to_name(&mut raw.name, path)?;
302 unsafe { btrfs_ioc_scan_dev(ctl.as_raw_fd(), &raw const raw) }?;
303 Ok(())
304}
305
306/// Unregister a device (or all stale devices) from the kernel's btrfs device
307/// scanner.
308///
309/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_FORGET_DEV`. If `path`
310/// is `None`, all devices that are not part of a currently mounted filesystem
311/// are unregistered. If `path` is `Some`, only that specific device path is
312/// unregistered.
313///
314/// # Errors
315///
316/// Returns `Err` if opening `/dev/btrfs-control` or the ioctl fails.
317pub fn device_forget(path: Option<&CStr>) -> nix::Result<()> {
318 let ctl = open_control()?;
319 let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
320 if let Some(p) = path {
321 copy_path_to_name(&mut raw.name, p)?;
322 }
323 unsafe { btrfs_ioc_forget_dev(ctl.as_raw_fd(), &raw const raw) }?;
324 Ok(())
325}
326
327/// Check whether all member devices of the filesystem that contains `path`
328/// are available and the filesystem is ready to mount.
329///
330/// Opens `/dev/btrfs-control` and issues `BTRFS_IOC_DEVICES_READY`. `path`
331/// must be the path to one of the block devices belonging to the filesystem.
332/// Returns `Ok(())` when all devices are present; returns an error (typically
333/// `ENOENT` or `ENXIO`) if the set is incomplete.
334///
335/// # Errors
336///
337/// Returns `Err` if some devices are missing or the ioctl fails.
338pub fn device_ready(path: &CStr) -> nix::Result<()> {
339 let ctl = open_control()?;
340 // BTRFS_IOC_DEVICES_READY is declared _IOR but the kernel reads the device
341 // path from args.name, so we pass a mut pointer as ioctl_read! requires.
342 let mut raw: btrfs_ioctl_vol_args = unsafe { mem::zeroed() };
343 copy_path_to_name(&mut raw.name, path)?;
344 unsafe { btrfs_ioc_devices_ready(ctl.as_raw_fd(), &raw mut raw) }?;
345 Ok(())
346}
347
348/// Query I/O error statistics for the device identified by `devid` within the
349/// filesystem referred to by `fd`.
350///
351/// If `reset` is `true`, the kernel atomically returns the current values and
352/// then resets all counters to zero. The kernel requires `CAP_SYS_ADMIN`.
353///
354/// # Errors
355///
356/// Returns `Err` if the ioctl fails.
357pub fn device_stats(
358 fd: BorrowedFd,
359 devid: u64,
360 reset: bool,
361) -> nix::Result<DeviceStats> {
362 let mut raw: btrfs_ioctl_get_dev_stats = unsafe { mem::zeroed() };
363 raw.devid = devid;
364 raw.nr_items = u64::from(btrfs_dev_stat_values_BTRFS_DEV_STAT_VALUES_MAX);
365 if reset {
366 raw.flags = u64::from(BTRFS_DEV_STATS_RESET);
367 }
368
369 unsafe { btrfs_ioc_get_dev_stats(fd.as_raw_fd(), &raw mut raw) }?;
370
371 Ok(DeviceStats {
372 devid,
373 write_errs: raw.values
374 [btrfs_dev_stat_values_BTRFS_DEV_STAT_WRITE_ERRS as usize],
375 read_errs: raw.values
376 [btrfs_dev_stat_values_BTRFS_DEV_STAT_READ_ERRS as usize],
377 flush_errs: raw.values
378 [btrfs_dev_stat_values_BTRFS_DEV_STAT_FLUSH_ERRS as usize],
379 corruption_errs: raw.values
380 [btrfs_dev_stat_values_BTRFS_DEV_STAT_CORRUPTION_ERRS as usize],
381 generation_errs: raw.values
382 [btrfs_dev_stat_values_BTRFS_DEV_STAT_GENERATION_ERRS as usize],
383 })
384}
385
/// One mebibyte, in bytes.
const SZ_1M: u64 = 1 << 20;
/// Thirty-two mebibytes, in bytes.
const SZ_32M: u64 = 32 * SZ_1M;

/// Number of superblock mirror copies btrfs maintains.
const BTRFS_SUPER_MIRROR_MAX: usize = 3;
391
/// Return the byte offset of superblock mirror `i`.
///
/// Mirror 0 is at 64 KiB, mirror 1 at 64 MiB, mirror 2 at 256 GiB.
///
/// The primary copy has a fixed offset. Every further mirror sits at
/// 16 KiB shifted left by 12 bits per mirror index, which gives
/// `16 KiB << 12 = 64 MiB` and `16 KiB << 24 = 256 GiB`.
///
/// Only indices below the number of superblock mirrors are meaningful;
/// large indices overflow the shift.
fn sb_offset(i: usize) -> u64 {
    /// Offset of the primary superblock.
    const PRIMARY_OFFSET: u64 = 64 * 1024;
    /// Base value that is shifted to obtain the secondary mirror offsets.
    const MIRROR_BASE: u64 = 16 * 1024;
    /// Number of bits the base is shifted per mirror index.
    const MIRROR_SHIFT: usize = 12;

    match i {
        0 => PRIMARY_OFFSET,
        _ => MIRROR_BASE << (MIRROR_SHIFT * i),
    }
}
401
/// A contiguous physical byte range on a device (inclusive end).
///
/// Used both for allocated device extents and for the holes between them
/// while computing the minimum device size. Because `end` is inclusive, the
/// length of a range is `end - start + 1`.
#[derive(Debug, Clone, Copy)]
struct Extent {
    /// First byte of the range.
    start: u64,
    /// Inclusive end byte.
    end: u64,
}
409
410/// Compute the minimum size to which device `devid` can be shrunk.
411///
412/// Walks the device tree for all `DEV_EXTENT_KEY` items belonging to
413/// `devid`, sums their lengths, then adjusts for extents that sit beyond
414/// the sum by checking whether they can be relocated into holes closer to
415/// the start of the device. The algorithm matches `btrfs inspect-internal
416/// min-dev-size` from btrfs-progs.
417///
418/// Requires `CAP_SYS_ADMIN`.
419///
420/// # Errors
421///
422/// Returns `Err` if the tree search ioctl fails.
423pub fn device_min_size(fd: BorrowedFd, devid: u64) -> nix::Result<u64> {
424 let mut dev_extents: Vec<(u64, u64)> = Vec::new();
425
426 tree_search(
427 fd,
428 SearchFilter::for_objectid_range(
429 u64::from(BTRFS_DEV_TREE_OBJECTID),
430 BTRFS_DEV_EXTENT_KEY,
431 devid,
432 devid,
433 ),
434 |hdr, data| {
435 let Some(de) = btrfs_disk::items::DeviceExtent::parse(data) else {
436 return Ok(());
437 };
438 dev_extents.push((hdr.offset, de.length));
439 Ok(())
440 },
441 )?;
442
443 Ok(compute_min_size(&dev_extents))
444}
445
446/// Compute the minimum device size from a list of device extents.
447///
448/// Each entry is `(physical_start, length)`. The list must be sorted by
449/// ascending `physical_start` (as returned by the device tree).
450///
451/// The algorithm sums all extent lengths (plus 1 MiB base), then tries to
452/// relocate tail extents into holes to reduce the total. Matches the
453/// btrfs-progs `min-dev-size` logic.
454#[must_use]
455pub fn compute_min_size(dev_extents: &[(u64, u64)]) -> u64 {
456 let mut min_size: u64 = SZ_1M;
457 let mut extents: Vec<Extent> = Vec::new();
458 let mut holes: Vec<Extent> = Vec::new();
459 let mut last_pos: Option<u64> = None;
460
461 for &(phys_start, len) in dev_extents {
462 min_size += len;
463
464 extents.push(Extent {
465 start: phys_start,
466 end: phys_start + len - 1,
467 });
468
469 if let Some(prev_end) = last_pos
470 && prev_end != phys_start
471 {
472 holes.push(Extent {
473 start: prev_end,
474 end: phys_start - 1,
475 });
476 }
477
478 last_pos = Some(phys_start + len);
479 }
480
481 // Sort extents by descending end offset for the adjustment pass.
482 extents.sort_by(|a, b| b.end.cmp(&a.end));
483
484 adjust_min_size(&mut extents, &mut holes, &mut min_size);
485
486 min_size
487}
488
489/// Check whether a byte range `[start, end]` contains a superblock mirror.
490fn hole_includes_sb_mirror(start: u64, end: u64) -> bool {
491 (0..BTRFS_SUPER_MIRROR_MAX).any(|i| {
492 let bytenr = sb_offset(i);
493 bytenr >= start && bytenr <= end
494 })
495}
496
497/// Adjust `min_size` downward by relocating tail extents into holes.
498///
499/// Processes extents in descending order of end offset. If an extent sits
500/// beyond the current `min_size`, try to find a hole large enough to
501/// relocate it. If no hole fits, the device cannot be shrunk past that
502/// extent and `min_size` is set to its end + 1.
503///
504/// Adds scratch space (largest relocated extent + 32 MiB for a potential
505/// system chunk allocation) when any relocation is needed.
506fn adjust_min_size(
507 extents: &mut Vec<Extent>,
508 holes: &mut Vec<Extent>,
509 min_size: &mut u64,
510) {
511 let mut scratch_space: u64 = 0;
512
513 while let Some(&ext) = extents.first() {
514 if ext.end < *min_size {
515 break;
516 }
517
518 let extent_len = ext.end - ext.start + 1;
519
520 // Find the first hole large enough to hold this extent.
521 let hole_idx = holes.iter().position(|h| {
522 let hole_len = h.end - h.start + 1;
523 hole_len >= extent_len
524 });
525
526 let Some(idx) = hole_idx else {
527 *min_size = ext.end + 1;
528 break;
529 };
530
531 // If the target hole contains a superblock mirror location,
532 // pessimistically assume we need one more extent worth of space.
533 if hole_includes_sb_mirror(
534 holes[idx].start,
535 holes[idx].start + extent_len - 1,
536 ) {
537 *min_size += extent_len;
538 }
539
540 // Shrink or remove the hole.
541 let hole_len = holes[idx].end - holes[idx].start + 1;
542 if hole_len > extent_len {
543 holes[idx].start += extent_len;
544 } else {
545 holes.remove(idx);
546 }
547
548 extents.remove(0);
549
550 if extent_len > scratch_space {
551 scratch_space = extent_len;
552 }
553 }
554
555 if scratch_space > 0 {
556 *min_size += scratch_space;
557 // Chunk allocation may require a new system chunk (up to 32 MiB).
558 *min_size += SZ_32M;
559 }
560}