Skip to main content

btrfs_uapi/
quota.rs

1//! # Quota and qgroup management: enabling quotas and tracking disk usage
2//!
3//! Quota accounting tracks disk usage per subvolume via qgroups (quota groups).
4//! It must be explicitly enabled before any qgroup limits or usage data are
5//! available.  Once enabled, usage numbers are maintained incrementally by the
6//! kernel; a rescan rebuilds them from scratch if they become inconsistent.
7//!
8//! Every subvolume automatically gets a level-0 qgroup whose ID matches the
9//! subvolume ID.  Higher-level qgroups can be created and linked into a
10//! parent-child hierarchy so that space usage rolls up through the tree.
11//!
12//! Quota status (whether quotas are on, which mode, inconsistency flag) is
13//! read from sysfs via [`crate::sysfs::SysfsBtrfs::quota_status`].
14//!
15//! Most operations require `CAP_SYS_ADMIN`.
16
17use crate::{
18    field_size,
19    raw::{
20        BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID,
21        BTRFS_QGROUP_INFO_KEY, BTRFS_QGROUP_LIMIT_EXCL_CMPR,
22        BTRFS_QGROUP_LIMIT_KEY, BTRFS_QGROUP_LIMIT_MAX_EXCL,
23        BTRFS_QGROUP_LIMIT_MAX_RFER, BTRFS_QGROUP_LIMIT_RFER_CMPR,
24        BTRFS_QGROUP_RELATION_KEY, BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT,
25        BTRFS_QGROUP_STATUS_FLAG_ON, BTRFS_QGROUP_STATUS_FLAG_RESCAN,
26        BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE, BTRFS_QGROUP_STATUS_KEY,
27        BTRFS_QUOTA_CTL_DISABLE, BTRFS_QUOTA_CTL_ENABLE,
28        BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA, BTRFS_QUOTA_TREE_OBJECTID,
29        BTRFS_ROOT_ITEM_KEY, BTRFS_ROOT_TREE_OBJECTID, btrfs_ioc_qgroup_assign,
30        btrfs_ioc_qgroup_create, btrfs_ioc_qgroup_limit, btrfs_ioc_quota_ctl,
31        btrfs_ioc_quota_rescan, btrfs_ioc_quota_rescan_status,
32        btrfs_ioc_quota_rescan_wait, btrfs_ioctl_qgroup_assign_args,
33        btrfs_ioctl_qgroup_create_args, btrfs_ioctl_qgroup_limit_args,
34        btrfs_ioctl_quota_ctl_args, btrfs_ioctl_quota_rescan_args,
35        btrfs_qgroup_info_item, btrfs_qgroup_limit, btrfs_qgroup_limit_item,
36        btrfs_qgroup_status_item,
37    },
38    tree_search::{SearchKey, tree_search},
39    util::read_le_u64,
40};
41use bitflags::bitflags;
42use nix::errno::Errno;
43use std::{
44    collections::{HashMap, HashSet},
45    mem::{self, offset_of, size_of},
46    os::{fd::AsRawFd, unix::io::BorrowedFd},
47};
48
49/// Enable quota accounting on the filesystem referred to by `fd`.
50///
51/// When `simple` is `true`, uses `BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA`, which
52/// accounts for extent ownership by lifetime rather than backref walks. This is
53/// faster but less precise than full qgroup accounting.
54pub fn quota_enable(fd: BorrowedFd, simple: bool) -> nix::Result<()> {
55    let cmd = if simple {
56        BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA as u64
57    } else {
58        BTRFS_QUOTA_CTL_ENABLE as u64
59    };
60    let mut args: btrfs_ioctl_quota_ctl_args = unsafe { mem::zeroed() };
61    args.cmd = cmd;
62    unsafe { btrfs_ioc_quota_ctl(fd.as_raw_fd(), &mut args) }?;
63    Ok(())
64}
65
66/// Disable quota accounting on the filesystem referred to by `fd`.
67pub fn quota_disable(fd: BorrowedFd) -> nix::Result<()> {
68    let mut args: btrfs_ioctl_quota_ctl_args = unsafe { mem::zeroed() };
69    args.cmd = BTRFS_QUOTA_CTL_DISABLE as u64;
70    unsafe { btrfs_ioc_quota_ctl(fd.as_raw_fd(), &mut args) }?;
71    Ok(())
72}
73
74/// Start a quota rescan on the filesystem referred to by `fd`.
75///
76/// Returns immediately after kicking off the background scan. Use
77/// [`quota_rescan_wait`] to block until it finishes. If a rescan is already
78/// in progress the kernel returns `EINPROGRESS`; callers that are about to
79/// wait anyway can treat that as a non-error.
80pub fn quota_rescan(fd: BorrowedFd) -> nix::Result<()> {
81    let args: btrfs_ioctl_quota_rescan_args = unsafe { mem::zeroed() };
82    unsafe { btrfs_ioc_quota_rescan(fd.as_raw_fd(), &args) }?;
83    Ok(())
84}
85
86/// Block until the quota rescan currently running on the filesystem referred
87/// to by `fd` completes. Returns immediately if no rescan is in progress.
88pub fn quota_rescan_wait(fd: BorrowedFd) -> nix::Result<()> {
89    unsafe { btrfs_ioc_quota_rescan_wait(fd.as_raw_fd()) }?;
90    Ok(())
91}
92
/// Snapshot of the quota rescan state, as returned by
/// [`quota_rescan_status`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct QuotaRescanStatus {
    /// `true` while a rescan is in progress.
    pub running: bool,
    /// Object ID of the tree item scanned most recently.  Carries no useful
    /// information unless `running` is `true`.
    pub progress: u64,
}
102
103/// Query the status of the quota rescan on the filesystem referred to by `fd`.
104pub fn quota_rescan_status(fd: BorrowedFd) -> nix::Result<QuotaRescanStatus> {
105    let mut args: btrfs_ioctl_quota_rescan_args = unsafe { mem::zeroed() };
106    unsafe { btrfs_ioc_quota_rescan_status(fd.as_raw_fd(), &mut args) }?;
107    Ok(QuotaRescanStatus {
108        running: args.flags != 0,
109        progress: args.progress,
110    })
111}
112
/// Return the hierarchy level encoded in a packed qgroup ID.
///
/// A qgroup ID is laid out as `(level << 48) | subvolid`, so the level is the
/// top 16 bits.  Level 0 qgroups correspond directly to subvolumes.
#[inline]
pub fn qgroupid_level(qgroupid: u64) -> u16 {
    const LEVEL_SHIFT: u32 = 48;
    // Only 16 bits remain after the shift, so the narrowing cast is lossless.
    (qgroupid >> LEVEL_SHIFT) as u16
}
121
/// Return the subvolume-ID component (the low 48 bits) of a packed qgroup ID.
///
/// The result is only meaningful as a subvolume ID for level-0 qgroups.
#[inline]
pub fn qgroupid_subvolid(qgroupid: u64) -> u64 {
    const SUBVOLID_MASK: u64 = (1u64 << 48) - 1;
    qgroupid & SUBVOLID_MASK
}
129
130bitflags! {
131    /// Status flags for the quota tree as a whole (`BTRFS_QGROUP_STATUS_KEY`).
132    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
133    pub struct QgroupStatusFlags: u64 {
134        /// Quota accounting is enabled.
135        const ON           = BTRFS_QGROUP_STATUS_FLAG_ON as u64;
136        /// A rescan is currently in progress.
137        const RESCAN       = BTRFS_QGROUP_STATUS_FLAG_RESCAN as u64;
138        /// Accounting is inconsistent and a rescan is needed.
139        const INCONSISTENT = BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT as u64;
140        /// Simple quota mode (squota) is active.
141        const SIMPLE_MODE  = BTRFS_QGROUP_STATUS_FLAG_SIMPLE_MODE as u64;
142    }
143}
144
145bitflags! {
146    /// Which limit fields are actively enforced on a qgroup.
147    #[derive(Debug, Clone, Copy, PartialEq, Eq)]
148    pub struct QgroupLimitFlags: u64 {
149        /// `max_rfer` (maximum referenced bytes) is enforced.
150        const MAX_RFER  = BTRFS_QGROUP_LIMIT_MAX_RFER as u64;
151        /// `max_excl` (maximum exclusive bytes) is enforced.
152        const MAX_EXCL  = BTRFS_QGROUP_LIMIT_MAX_EXCL as u64;
153        /// Referenced bytes are compressed before comparison.
154        const RFER_CMPR = BTRFS_QGROUP_LIMIT_RFER_CMPR as u64;
155        /// Exclusive bytes are compressed before comparison.
156        const EXCL_CMPR = BTRFS_QGROUP_LIMIT_EXCL_CMPR as u64;
157    }
158}
159
160/// Usage and limit information for a single qgroup.
161#[derive(Debug, Clone)]
162pub struct QgroupInfo {
163    /// Packed qgroup ID: `(level << 48) | subvolid`.
164    pub qgroupid: u64,
165    /// Total referenced bytes (includes shared data).
166    pub rfer: u64,
167    /// Referenced bytes after compression.
168    pub rfer_cmpr: u64,
169    /// Exclusively-owned bytes (not shared with any other subvolume).
170    pub excl: u64,
171    /// Exclusively-owned bytes after compression.
172    pub excl_cmpr: u64,
173    /// Limit flags — which of the limit fields below are enforced.
174    pub limit_flags: QgroupLimitFlags,
175    /// Maximum referenced bytes.  `u64::MAX` when no limit is set.
176    pub max_rfer: u64,
177    /// Maximum exclusive bytes.  `u64::MAX` when no limit is set.
178    pub max_excl: u64,
179    /// IDs of qgroups that are parents of this one in the hierarchy.
180    pub parents: Vec<u64>,
181    /// IDs of qgroups that are children of this one in the hierarchy.
182    pub children: Vec<u64>,
183    /// Level-0 only: `true` when the corresponding subvolume no longer
184    /// exists (this is a "stale" qgroup left behind after deletion).
185    pub stale: bool,
186}
187
188/// Result of [`qgroup_list`]: overall quota status and per-qgroup details.
189#[derive(Debug, Clone)]
190pub struct QgroupList {
191    /// Flags from the `BTRFS_QGROUP_STATUS_KEY` item.
192    pub status_flags: QgroupStatusFlags,
193    /// All qgroups found in the quota tree, sorted by `qgroupid`.
194    pub qgroups: Vec<QgroupInfo>,
195}
196
/// Accumulates the pieces of one qgroup while the quota tree is walked.
///
/// The INFO, LIMIT and RELATION items for a qgroup are parsed one at a time,
/// so each group of fields is filled in independently and the defaults stand
/// in for whatever has not been parsed.
#[derive(Default)]
struct QgroupEntryBuilder {
    // --- filled in by the INFO item ---
    /// Set once an INFO item has been parsed.
    has_info: bool,
    rfer: u64,
    rfer_cmpr: u64,
    excl: u64,
    excl_cmpr: u64,

    // --- filled in by the LIMIT item ---
    /// Set once a LIMIT item has been parsed.
    has_limit: bool,
    /// Raw, unfiltered limit flag bits.
    limit_flags: u64,
    max_rfer: u64,
    max_excl: u64,

    // --- filled in by RELATION items ---
    parents: Vec<u64>,
    children: Vec<u64>,
}
214
215impl QgroupEntryBuilder {
216    fn build(self, qgroupid: u64, stale: bool) -> QgroupInfo {
217        QgroupInfo {
218            qgroupid,
219            rfer: self.rfer,
220            rfer_cmpr: self.rfer_cmpr,
221            excl: self.excl,
222            excl_cmpr: self.excl_cmpr,
223            limit_flags: QgroupLimitFlags::from_bits_truncate(self.limit_flags),
224            max_rfer: if self.limit_flags & BTRFS_QGROUP_LIMIT_MAX_RFER as u64
225                != 0
226            {
227                self.max_rfer
228            } else {
229                u64::MAX
230            },
231            max_excl: if self.limit_flags & BTRFS_QGROUP_LIMIT_MAX_EXCL as u64
232                != 0
233            {
234                self.max_excl
235            } else {
236                u64::MAX
237            },
238            parents: self.parents,
239            children: self.children,
240            stale,
241        }
242    }
243}
244
245fn parse_status_flags(data: &[u8]) -> Option<u64> {
246    let off = offset_of!(btrfs_qgroup_status_item, flags);
247    if data.len() < off + field_size!(btrfs_qgroup_status_item, flags) {
248        return None;
249    }
250    Some(read_le_u64(data, off))
251}
252
253fn parse_info(builder: &mut QgroupEntryBuilder, data: &[u8]) {
254    if data.len() < size_of::<btrfs_qgroup_info_item>() {
255        return;
256    }
257
258    builder.has_info = true;
259    builder.rfer = read_le_u64(data, offset_of!(btrfs_qgroup_info_item, rfer));
260    builder.rfer_cmpr =
261        read_le_u64(data, offset_of!(btrfs_qgroup_info_item, rfer_cmpr));
262    builder.excl = read_le_u64(data, offset_of!(btrfs_qgroup_info_item, excl));
263    builder.excl_cmpr =
264        read_le_u64(data, offset_of!(btrfs_qgroup_info_item, excl_cmpr));
265}
266
267fn parse_limit(builder: &mut QgroupEntryBuilder, data: &[u8]) {
268    let end = offset_of!(btrfs_qgroup_limit_item, max_excl)
269        + field_size!(btrfs_qgroup_limit_item, max_excl);
270    if data.len() < end {
271        return;
272    }
273
274    builder.has_limit = true;
275    builder.limit_flags =
276        read_le_u64(data, offset_of!(btrfs_qgroup_limit_item, flags));
277    builder.max_rfer =
278        read_le_u64(data, offset_of!(btrfs_qgroup_limit_item, max_rfer));
279    builder.max_excl =
280        read_le_u64(data, offset_of!(btrfs_qgroup_limit_item, max_excl));
281}
282
283/// Create a new qgroup with the given `qgroupid` on the filesystem referred
284/// to by `fd`.
285///
286/// `qgroupid` is the packed form: `(level << 48) | subvolid`.
287pub fn qgroup_create(fd: BorrowedFd, qgroupid: u64) -> nix::Result<()> {
288    let mut args: btrfs_ioctl_qgroup_create_args = unsafe { mem::zeroed() };
289    args.create = 1;
290    args.qgroupid = qgroupid;
291    // SAFETY: args is fully initialised above and lives for the duration of
292    // the ioctl call.
293    unsafe { btrfs_ioc_qgroup_create(fd.as_raw_fd(), &args) }?;
294    Ok(())
295}
296
297/// Destroy the qgroup with the given `qgroupid` on the filesystem referred
298/// to by `fd`.
299pub fn qgroup_destroy(fd: BorrowedFd, qgroupid: u64) -> nix::Result<()> {
300    let mut args: btrfs_ioctl_qgroup_create_args = unsafe { mem::zeroed() };
301    args.create = 0;
302    args.qgroupid = qgroupid;
303    // SAFETY: args is fully initialised above and lives for the duration of
304    // the ioctl call.
305    unsafe { btrfs_ioc_qgroup_create(fd.as_raw_fd(), &args) }?;
306    Ok(())
307}
308
309/// Assign qgroup `src` as a member of qgroup `dst` (i.e. `src` becomes a
310/// child of `dst`).
311///
312/// Returns `true` if the kernel indicates that a quota rescan is now needed
313/// (the ioctl returned a positive value).
314///
315/// Errors: ENOENT if either qgroup does not exist.  EEXIST if the
316/// relationship already exists.
317pub fn qgroup_assign(fd: BorrowedFd, src: u64, dst: u64) -> nix::Result<bool> {
318    let mut args: btrfs_ioctl_qgroup_assign_args = unsafe { mem::zeroed() };
319    args.assign = 1;
320    args.src = src;
321    args.dst = dst;
322    // SAFETY: args is fully initialised above and lives for the duration of
323    // the ioctl call.
324    let ret = unsafe { btrfs_ioc_qgroup_assign(fd.as_raw_fd(), &args) }?;
325    Ok(ret > 0)
326}
327
328/// Remove the child-parent relationship between qgroups `src` and `dst`.
329///
330/// Returns `true` if the kernel indicates that a quota rescan is now needed.
331///
332/// Errors: ENOENT if either qgroup does not exist or the relationship
333/// is not present.
334pub fn qgroup_remove(fd: BorrowedFd, src: u64, dst: u64) -> nix::Result<bool> {
335    let mut args: btrfs_ioctl_qgroup_assign_args = unsafe { mem::zeroed() };
336    args.assign = 0;
337    args.src = src;
338    args.dst = dst;
339    // SAFETY: args is fully initialised above and lives for the duration of
340    // the ioctl call.
341    let ret = unsafe { btrfs_ioc_qgroup_assign(fd.as_raw_fd(), &args) }?;
342    Ok(ret > 0)
343}
344
345/// Set usage limits on a qgroup.
346///
347/// Pass `QgroupLimitFlags::MAX_RFER` in `flags` to enforce `max_rfer`, and/or
348/// `QgroupLimitFlags::MAX_EXCL` to enforce `max_excl`.  Clear a limit by
349/// omitting the corresponding flag.
350pub fn qgroup_limit(
351    fd: BorrowedFd,
352    qgroupid: u64,
353    flags: QgroupLimitFlags,
354    max_rfer: u64,
355    max_excl: u64,
356) -> nix::Result<()> {
357    let lim = btrfs_qgroup_limit {
358        flags: flags.bits(),
359        max_referenced: max_rfer,
360        max_exclusive: max_excl,
361        rsv_referenced: 0,
362        rsv_exclusive: 0,
363    };
364    let mut args: btrfs_ioctl_qgroup_limit_args = unsafe { mem::zeroed() };
365    args.qgroupid = qgroupid;
366    args.lim = lim;
367    // SAFETY: args is fully initialised above and lives for the duration of
368    // the ioctl call.  The ioctl number is #43 (_IOR direction in the kernel
369    // header), which reads args from userspace.
370    unsafe { btrfs_ioc_qgroup_limit(fd.as_raw_fd(), &mut args) }?;
371    Ok(())
372}
373
374/// List all qgroups and overall quota status for the filesystem referred to
375/// by `fd`.
376///
377/// Returns `Ok(QgroupList { status_flags: empty, qgroups: [] })` when quota
378/// accounting is not enabled (`ENOENT` from the kernel).
379pub fn qgroup_list(fd: BorrowedFd) -> nix::Result<QgroupList> {
380    // Build a map of qgroupid → builder as we walk the quota tree.
381    let mut builders: HashMap<u64, QgroupEntryBuilder> = HashMap::new();
382    let mut status_flags = QgroupStatusFlags::empty();
383
384    // Scan the quota tree for STATUS / INFO / LIMIT / RELATION items in one pass.
385    let quota_key = SearchKey {
386        tree_id: BTRFS_QUOTA_TREE_OBJECTID as u64,
387        min_objectid: 0,
388        max_objectid: u64::MAX,
389        min_type: BTRFS_QGROUP_STATUS_KEY,
390        max_type: BTRFS_QGROUP_RELATION_KEY,
391        min_offset: 0,
392        max_offset: u64::MAX,
393        min_transid: 0,
394        max_transid: u64::MAX,
395    };
396
397    let scan_result = tree_search(fd, quota_key, |hdr, data| {
398        match hdr.item_type {
399            t if t == BTRFS_QGROUP_STATUS_KEY => {
400                if let Some(raw) = parse_status_flags(data) {
401                    status_flags = QgroupStatusFlags::from_bits_truncate(raw);
402                }
403            }
404            t if t == BTRFS_QGROUP_INFO_KEY => {
405                // offset = qgroupid
406                let entry = builders.entry(hdr.offset).or_default();
407                parse_info(entry, data);
408            }
409            t if t == BTRFS_QGROUP_LIMIT_KEY => {
410                // offset = qgroupid
411                let entry = builders.entry(hdr.offset).or_default();
412                parse_limit(entry, data);
413            }
414            t if t == BTRFS_QGROUP_RELATION_KEY => {
415                // The kernel stores two entries per relation:
416                //   (child, RELATION_KEY, parent)
417                //   (parent, RELATION_KEY, child)
418                // Only process the canonical form where objectid > offset,
419                // i.e. parent > child.
420                if hdr.objectid > hdr.offset {
421                    let parent = hdr.objectid;
422                    let child = hdr.offset;
423                    builders.entry(child).or_default().parents.push(parent);
424                    builders.entry(parent).or_default().children.push(child);
425                }
426            }
427            _ => {}
428        }
429        Ok(())
430    });
431
432    match scan_result {
433        Err(Errno::ENOENT) => {
434            // Quota tree does not exist — quotas are disabled.
435            return Ok(QgroupList {
436                status_flags: QgroupStatusFlags::empty(),
437                qgroups: Vec::new(),
438            });
439        }
440        Err(e) => return Err(e),
441        Ok(()) => {}
442    }
443
444    // Collect existing subvolume IDs so we can mark stale level-0 qgroups.
445    let existing_subvol_ids = collect_subvol_ids(fd)?;
446
447    // Convert builders to QgroupInfo, computing stale flag for level-0 groups.
448    let mut qgroups: Vec<QgroupInfo> = builders
449        .into_iter()
450        .map(|(qgroupid, builder)| {
451            let stale = if qgroupid_level(qgroupid) == 0 {
452                !existing_subvol_ids.contains(&qgroupid_subvolid(qgroupid))
453            } else {
454                false
455            };
456            builder.build(qgroupid, stale)
457        })
458        .collect();
459
460    qgroups.sort_by_key(|q| q.qgroupid);
461
462    Ok(QgroupList {
463        status_flags,
464        qgroups,
465    })
466}
467
468/// Collect the set of all existing subvolume IDs by scanning
469/// `ROOT_ITEM_KEY` entries in the root tree.
470fn collect_subvol_ids(fd: BorrowedFd) -> nix::Result<HashSet<u64>> {
471    let mut ids: HashSet<u64> = HashSet::new();
472
473    // BTRFS_LAST_FREE_OBJECTID binds as i32 = -256; cast to u64 gives
474    // 0xFFFFFFFF_FFFFFF00 as expected.
475    let key = SearchKey::for_objectid_range(
476        BTRFS_ROOT_TREE_OBJECTID as u64,
477        BTRFS_ROOT_ITEM_KEY,
478        BTRFS_FIRST_FREE_OBJECTID as u64,
479        BTRFS_LAST_FREE_OBJECTID as u64,
480    );
481
482    tree_search(fd, key, |hdr, _data| {
483        ids.insert(hdr.objectid);
484        Ok(())
485    })?;
486
487    Ok(ids)
488}
489
490/// Destroy all "stale" level-0 qgroups — those whose corresponding subvolume
491/// no longer exists.
492///
493/// In simple-quota mode (`SIMPLE_MODE` flag set), stale qgroups with non-zero
494/// `rfer` or `excl` are retained because they hold accounting information for
495/// dropped subvolumes.
496///
497/// Returns the number of qgroups successfully destroyed.
498pub fn qgroup_clear_stale(fd: BorrowedFd) -> nix::Result<usize> {
499    let list = qgroup_list(fd)?;
500    let simple_mode =
501        list.status_flags.contains(QgroupStatusFlags::SIMPLE_MODE);
502
503    let mut count = 0usize;
504
505    for qg in &list.qgroups {
506        // Only process level-0 stale qgroups.
507        if qgroupid_level(qg.qgroupid) != 0 || !qg.stale {
508            continue;
509        }
510
511        // In simple-quota mode, keep stale qgroups that still have usage data.
512        if simple_mode && (qg.rfer != 0 || qg.excl != 0) {
513            continue;
514        }
515
516        if qgroup_destroy(fd, qg.qgroupid).is_ok() {
517            count += 1;
518        }
519    }
520
521    Ok(count)
522}
523
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a packed qgroup ID from its two components.
    fn pack(level: u64, subvolid: u64) -> u64 {
        (level << 48) | subvolid
    }

    #[test]
    fn qgroupid_level_zero() {
        for id in [5u64, 256] {
            assert_eq!(qgroupid_level(id), 0);
        }
    }

    #[test]
    fn qgroupid_level_nonzero() {
        for (level, subvolid) in [(1u64, 100u64), (3, 42)] {
            assert_eq!(qgroupid_level(pack(level, subvolid)), level as u16);
        }
    }

    #[test]
    fn qgroupid_subvolid_extracts_lower_48_bits() {
        assert_eq!(qgroupid_subvolid(256), 256);
        assert_eq!(qgroupid_subvolid(pack(1, 100)), 100);
        assert_eq!(qgroupid_subvolid(pack(2, 0)), 0);
    }

    #[test]
    fn qgroupid_roundtrip() {
        let (level, subvolid) = (2u64, 999u64);
        let packed = pack(level, subvolid);
        assert_eq!(qgroupid_level(packed), level as u16);
        assert_eq!(qgroupid_subvolid(packed), subvolid);
    }
}