Skip to main content

btrfs_uapi/
chunk.rs

1//! # Per-device physical allocation data from the chunk tree
2//!
3//! Walks the chunk tree to determine how many bytes each device has allocated,
4//! broken down by block-group profile flags.  This is the data source for the
5//! per-device breakdown in `btrfs filesystem usage`.
6//!
7//! Also exposes the full per-stripe chunk list used by `inspect-internal
8//! list-chunks`, including the bytes-used figure from the extent tree.
9//!
10//! Requires `CAP_SYS_ADMIN`.
11
12use crate::{
13    field_size,
14    raw::{
15        BTRFS_BLOCK_GROUP_ITEM_KEY, BTRFS_CHUNK_ITEM_KEY, BTRFS_CHUNK_TREE_OBJECTID,
16        BTRFS_EXTENT_TREE_OBJECTID, BTRFS_FIRST_CHUNK_TREE_OBJECTID, btrfs_block_group_item,
17        btrfs_chunk, btrfs_stripe,
18    },
19    space::BlockGroupFlags,
20    tree_search::{SearchKey, tree_search},
21};
22use std::os::unix::io::BorrowedFd;
23
24/// Physical allocation of one block-group profile on one device, as read
25/// from the chunk tree.
26///
27/// `bytes` is the sum of `stripe_len` over all chunk stripes that land on
28/// `devid` and share the same `flags`.  This is the physical space the device
29/// contributes to that profile, not the logical (usable) space.
30#[derive(Debug, Clone, PartialEq, Eq)]
31pub struct DeviceAllocation {
32    /// btrfs device ID.
33    pub devid: u64,
34    /// Combined block-group type and profile flags, e.g.
35    /// `BlockGroupFlags::DATA | BlockGroupFlags::RAID1`.
36    pub flags: BlockGroupFlags,
37    /// Physical bytes allocated on this device for chunks with these flags.
38    pub bytes: u64,
39}
40
41const CHUNK_LENGTH_OFF: usize = std::mem::offset_of!(btrfs_chunk, length);
42const CHUNK_STRIPE_LEN_OFF: usize = std::mem::offset_of!(btrfs_chunk, stripe_len);
43const CHUNK_TYPE_OFF: usize = std::mem::offset_of!(btrfs_chunk, type_);
44const CHUNK_NUM_STRIPES_OFF: usize = std::mem::offset_of!(btrfs_chunk, num_stripes);
45const CHUNK_FIRST_STRIPE_OFF: usize = std::mem::offset_of!(btrfs_chunk, stripe);
46
47const STRIPE_SIZE: usize = std::mem::size_of::<btrfs_stripe>();
48const STRIPE_DEVID_OFF: usize = std::mem::offset_of!(btrfs_stripe, devid);
49const STRIPE_OFFSET_OFF: usize = std::mem::offset_of!(btrfs_stripe, offset);
50
51// Minimum item length: the btrfs_chunk struct with exactly one stripe.
52const CHUNK_MIN_LEN: usize = CHUNK_FIRST_STRIPE_OFF + STRIPE_SIZE; // 80
53
54/// One physical chunk stripe as seen in the chunk tree, with usage data from
55/// the extent tree.
56///
57/// For striped profiles (RAID0, RAID10, …) each logical chunk maps to
58/// multiple stripes on different devices; each stripe yields one `ChunkEntry`.
59/// For non-striped profiles (single, DUP) there is one `ChunkEntry` per chunk.
60#[derive(Debug, Clone, PartialEq, Eq)]
61pub struct ChunkEntry {
62    /// btrfs device ID that holds this stripe.
63    pub devid: u64,
64    /// Physical byte offset of this stripe on the device.
65    pub physical_start: u64,
66    /// Logical byte offset of the chunk within the filesystem address space.
67    pub logical_start: u64,
68    /// Logical length of the chunk in bytes (shared across all stripes of
69    /// the same chunk).
70    pub length: u64,
71    /// Combined block-group type and profile flags.
72    pub flags: BlockGroupFlags,
73    /// Bytes currently used within this chunk, as reported by the extent tree.
74    /// `0` if the block-group item could not be read.
75    pub used: u64,
76}
77
78/// Walk the chunk tree of the filesystem referred to by `fd` and return the
79/// physical allocation of each block-group profile on each device.
80///
81/// The result may contain multiple entries with the same `devid` when a
82/// device participates in chunks of different profiles (e.g. both
83/// `DATA|SINGLE` and `METADATA|DUP`).  Entries with the same `(devid, flags)`
84/// pair are merged — there will be at most one entry per unique pair.
85///
86/// Internally, each `BTRFS_CHUNK_ITEM_KEY` payload is a packed `btrfs_chunk`
87/// struct followed by `num_stripes - 1` additional `btrfs_stripe` structs.
88/// The `stripe_len` field of each stripe is accumulated per `(devid, flags)`
89/// to produce the physical byte counts in the returned list.
90pub fn device_chunk_allocations(fd: BorrowedFd) -> nix::Result<Vec<DeviceAllocation>> {
91    let mut allocs: Vec<DeviceAllocation> = Vec::new();
92
93    tree_search(
94        fd,
95        SearchKey::for_type(
96            BTRFS_CHUNK_TREE_OBJECTID as u64,
97            BTRFS_CHUNK_ITEM_KEY as u32,
98        ),
99        |_hdr, data| {
100            if let Some((stripe_len, flags, stripes)) = parse_chunk(data) {
101                for devid in stripes {
102                    accumulate(&mut allocs, devid, flags, stripe_len);
103                }
104            }
105            Ok(())
106        },
107    )?;
108
109    Ok(allocs)
110}
111
112/// Walk the chunk tree and return one [`ChunkEntry`] per stripe, including
113/// bytes-used from the extent tree.
114///
115/// The returned list is in chunk-tree order (ascending logical offset); call
116/// sites are responsible for any further sorting.  For each logical chunk the
117/// `used` field is populated by a single extent-tree lookup; if that lookup
118/// fails the field is set to `0` rather than propagating an error.
119///
120/// Requires `CAP_SYS_ADMIN`.
121pub fn chunk_list(fd: BorrowedFd) -> nix::Result<Vec<ChunkEntry>> {
122    let mut entries: Vec<ChunkEntry> = Vec::new();
123
124    tree_search(
125        fd,
126        SearchKey::for_objectid_range(
127            BTRFS_CHUNK_TREE_OBJECTID as u64,
128            BTRFS_CHUNK_ITEM_KEY as u32,
129            BTRFS_FIRST_CHUNK_TREE_OBJECTID as u64,
130            BTRFS_FIRST_CHUNK_TREE_OBJECTID as u64,
131        ),
132        |hdr, data| {
133            if let Some(stripes) = parse_chunk_stripes(data) {
134                let logical_start = hdr.offset;
135                let length = read_le_u64(data, CHUNK_LENGTH_OFF);
136                let type_bits = read_le_u64(data, CHUNK_TYPE_OFF);
137                let flags = BlockGroupFlags::from_bits_truncate(type_bits);
138                let used = block_group_used(fd, logical_start).unwrap_or(0);
139                for (devid, physical_start) in stripes {
140                    entries.push(ChunkEntry {
141                        devid,
142                        physical_start,
143                        logical_start,
144                        length,
145                        flags,
146                        used,
147                    });
148                }
149            }
150            Ok(())
151        },
152    )?;
153
154    Ok(entries)
155}
156
157/// Look up the bytes-used counter for the block group at `logical_start` by
158/// searching for `BTRFS_BLOCK_GROUP_ITEM_KEY` in the extent tree.
159///
160/// Returns `None` if the block group item is not found or cannot be read.
161fn block_group_used(fd: BorrowedFd, logical_start: u64) -> Option<u64> {
162    let mut used: Option<u64> = None;
163    tree_search(
164        fd,
165        SearchKey {
166            tree_id: BTRFS_EXTENT_TREE_OBJECTID as u64,
167            min_objectid: logical_start,
168            max_objectid: logical_start,
169            min_type: BTRFS_BLOCK_GROUP_ITEM_KEY,
170            max_type: BTRFS_BLOCK_GROUP_ITEM_KEY,
171            min_offset: 0,
172            max_offset: u64::MAX,
173            min_transid: 0,
174            max_transid: u64::MAX,
175        },
176        |_hdr, data| {
177            let used_off = std::mem::offset_of!(btrfs_block_group_item, used);
178            if data.len() >= used_off + field_size!(btrfs_block_group_item, used) {
179                used = Some(read_le_u64(data, used_off));
180            }
181            Ok(())
182        },
183    )
184    .ok()?;
185    used
186}
187
188/// Parse a raw chunk item payload.
189///
190/// Returns `(stripe_len, flags, devids)` on success, or `None` if the buffer
191/// is too small to be a valid chunk item.
192fn parse_chunk(data: &[u8]) -> Option<(u64, BlockGroupFlags, impl Iterator<Item = u64> + '_)> {
193    if data.len() < CHUNK_MIN_LEN {
194        return None;
195    }
196
197    let stripe_len = read_le_u64(data, CHUNK_STRIPE_LEN_OFF);
198    let type_bits = read_le_u64(data, CHUNK_TYPE_OFF);
199    let num_stripes = read_le_u16(data, CHUNK_NUM_STRIPES_OFF) as usize;
200    let _length = read_le_u64(data, CHUNK_LENGTH_OFF);
201
202    // Sanity-check: the item must be large enough to hold all stripes.
203    let expected_len = CHUNK_FIRST_STRIPE_OFF + num_stripes * STRIPE_SIZE;
204    if data.len() < expected_len || num_stripes == 0 {
205        return None;
206    }
207
208    let flags = BlockGroupFlags::from_bits_truncate(type_bits);
209
210    let devids = (0..num_stripes).map(move |i| {
211        let stripe_off = CHUNK_FIRST_STRIPE_OFF + i * STRIPE_SIZE;
212        read_le_u64(data, stripe_off + STRIPE_DEVID_OFF)
213    });
214
215    Some((stripe_len, flags, devids))
216}
217
218/// Parse a raw chunk item payload and return an iterator of `(devid,
219/// physical_start)` pairs for each stripe.
220///
221/// Returns `None` if the buffer is too small to be a valid chunk item.
222fn parse_chunk_stripes(data: &[u8]) -> Option<impl Iterator<Item = (u64, u64)> + '_> {
223    if data.len() < CHUNK_MIN_LEN {
224        return None;
225    }
226
227    let num_stripes = read_le_u16(data, CHUNK_NUM_STRIPES_OFF) as usize;
228    let expected_len = CHUNK_FIRST_STRIPE_OFF + num_stripes * STRIPE_SIZE;
229    if data.len() < expected_len || num_stripes == 0 {
230        return None;
231    }
232
233    let iter = (0..num_stripes).map(move |i| {
234        let stripe_off = CHUNK_FIRST_STRIPE_OFF + i * STRIPE_SIZE;
235        let devid = read_le_u64(data, stripe_off + STRIPE_DEVID_OFF);
236        let physical_start = read_le_u64(data, stripe_off + STRIPE_OFFSET_OFF);
237        (devid, physical_start)
238    });
239
240    Some(iter)
241}
242
243/// Add `stripe_len` bytes to the `(devid, flags)` entry, creating it if
244/// it does not yet exist.
245fn accumulate(allocs: &mut Vec<DeviceAllocation>, devid: u64, flags: BlockGroupFlags, bytes: u64) {
246    if let Some(entry) = allocs
247        .iter_mut()
248        .find(|a| a.devid == devid && a.flags == flags)
249    {
250        entry.bytes += bytes;
251    } else {
252        allocs.push(DeviceAllocation {
253            devid,
254            flags,
255            bytes,
256        });
257    }
258}
259
/// Read a little-endian `u64` from `buf` starting at byte offset `off`.
///
/// # Panics
///
/// Panics if `buf` does not contain eight bytes starting at `off`.
fn read_le_u64(buf: &[u8], off: usize) -> u64 {
    let mut raw = [0u8; 8];
    raw.copy_from_slice(&buf[off..off + 8]);
    u64::from_le_bytes(raw)
}
263
/// Read a little-endian `u16` from `buf` starting at byte offset `off`.
///
/// # Panics
///
/// Panics if `buf` does not contain two bytes starting at `off`.
fn read_le_u16(buf: &[u8], off: usize) -> u16 {
    let mut raw = [0u8; 2];
    raw.copy_from_slice(&buf[off..off + 2]);
    u16::from_le_bytes(raw)
}
267
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a chunk item buffer with room for exactly `stripes.len()`
    /// stripe records.
    ///
    /// `num_stripes` is written to the header verbatim and may deliberately
    /// disagree with `stripes.len()` so that tests can exercise the
    /// validation paths of the parsers.
    fn build_chunk_buf(
        length: u64,
        stripe_len: u64,
        type_bits: u64,
        num_stripes: u16,
        stripes: &[(u64, u64)], // (devid, offset) per stripe
    ) -> Vec<u8> {
        let total = CHUNK_FIRST_STRIPE_OFF + stripes.len() * STRIPE_SIZE;
        let mut buf = vec![0u8; total];
        buf[CHUNK_LENGTH_OFF..CHUNK_LENGTH_OFF + 8].copy_from_slice(&length.to_le_bytes());
        buf[CHUNK_STRIPE_LEN_OFF..CHUNK_STRIPE_LEN_OFF + 8]
            .copy_from_slice(&stripe_len.to_le_bytes());
        buf[CHUNK_TYPE_OFF..CHUNK_TYPE_OFF + 8].copy_from_slice(&type_bits.to_le_bytes());
        buf[CHUNK_NUM_STRIPES_OFF..CHUNK_NUM_STRIPES_OFF + 2]
            .copy_from_slice(&num_stripes.to_le_bytes());
        for (i, &(devid, offset)) in stripes.iter().enumerate() {
            let s = CHUNK_FIRST_STRIPE_OFF + i * STRIPE_SIZE;
            buf[s + STRIPE_DEVID_OFF..s + STRIPE_DEVID_OFF + 8]
                .copy_from_slice(&devid.to_le_bytes());
            buf[s + STRIPE_OFFSET_OFF..s + STRIPE_OFFSET_OFF + 8]
                .copy_from_slice(&offset.to_le_bytes());
        }
        buf
    }

    // --- read_le_u64 / read_le_u16 ---

    #[test]
    fn read_le_u64_basic() {
        let buf = 0x0102030405060708u64.to_le_bytes();
        assert_eq!(read_le_u64(&buf, 0), 0x0102030405060708);
    }

    #[test]
    fn read_le_u64_at_offset() {
        let mut buf = vec![0xAAu8; 3];
        buf.extend_from_slice(&0xDEADBEEFCAFEF00Du64.to_le_bytes());
        assert_eq!(read_le_u64(&buf, 3), 0xDEADBEEFCAFEF00D);
    }

    #[test]
    fn read_le_u16_basic() {
        let buf = 0x0102u16.to_le_bytes();
        assert_eq!(read_le_u16(&buf, 0), 0x0102);
    }

    #[test]
    fn read_le_u16_at_offset() {
        let buf = [0xAAu8, 0x34, 0x12, 0x55];
        assert_eq!(read_le_u16(&buf, 1), 0x1234);
    }

    // --- parse_chunk ---

    #[test]
    fn parse_chunk_single_stripe() {
        let data_flags = BlockGroupFlags::DATA.bits();
        let buf = build_chunk_buf(1024 * 1024, 65536, data_flags, 1, &[(1, 0)]);
        let (stripe_len, flags, devids) = parse_chunk(&buf).unwrap();
        assert_eq!(stripe_len, 65536);
        assert_eq!(flags, BlockGroupFlags::DATA);
        let devids: Vec<u64> = devids.collect();
        assert_eq!(devids, vec![1]);
    }

    #[test]
    fn parse_chunk_two_stripes() {
        let flags_bits = (BlockGroupFlags::DATA | BlockGroupFlags::RAID1).bits();
        let buf = build_chunk_buf(1 << 30, 1 << 30, flags_bits, 2, &[(1, 0), (2, 4096)]);
        let (_, flags, devids) = parse_chunk(&buf).unwrap();
        assert_eq!(flags, BlockGroupFlags::DATA | BlockGroupFlags::RAID1);
        let devids: Vec<u64> = devids.collect();
        assert_eq!(devids, vec![1, 2]);
    }

    #[test]
    fn parse_chunk_too_short() {
        let buf = vec![0u8; CHUNK_MIN_LEN - 1];
        assert!(parse_chunk(&buf).is_none());
    }

    #[test]
    fn parse_chunk_zero_stripes() {
        // The buffer is large enough for one stripe, so the only reason to
        // reject it is the header claiming `num_stripes == 0`.
        let buf = build_chunk_buf(1024, 1024, 0, 0, &[(1, 0)]);
        assert_eq!(buf.len(), CHUNK_MIN_LEN);
        assert!(parse_chunk(&buf).is_none());
    }

    #[test]
    fn parse_chunk_claims_more_stripes_than_fit() {
        // num_stripes says 5 but buffer only has room for 1
        let buf = build_chunk_buf(1024, 1024, 0, 5, &[(1, 0)]);
        assert!(parse_chunk(&buf).is_none());
    }

    // --- parse_chunk_stripes ---

    #[test]
    fn parse_chunk_stripes_returns_devid_and_offset() {
        let buf = build_chunk_buf(1 << 20, 1 << 20, 0, 2, &[(3, 8192), (7, 16384)]);
        let stripes: Vec<(u64, u64)> = parse_chunk_stripes(&buf).unwrap().collect();
        assert_eq!(stripes, vec![(3, 8192), (7, 16384)]);
    }

    #[test]
    fn parse_chunk_stripes_too_short() {
        let buf = vec![0u8; 10];
        assert!(parse_chunk_stripes(&buf).is_none());
    }

    #[test]
    fn parse_chunk_stripes_zero_stripes() {
        let buf = build_chunk_buf(1024, 1024, 0, 0, &[(1, 0)]);
        assert!(parse_chunk_stripes(&buf).is_none());
    }

    #[test]
    fn parse_chunk_stripes_claims_more_stripes_than_fit() {
        let buf = build_chunk_buf(1024, 1024, 0, 3, &[(1, 0), (2, 0)]);
        assert!(parse_chunk_stripes(&buf).is_none());
    }

    #[test]
    fn parse_chunk_stripes_ignores_surplus_stripe_slots() {
        // Room for two stripes, but the header only claims one.
        let buf = build_chunk_buf(1024, 1024, 0, 1, &[(4, 100), (9, 200)]);
        let stripes: Vec<(u64, u64)> = parse_chunk_stripes(&buf).unwrap().collect();
        assert_eq!(stripes, vec![(4, 100)]);
    }

    // --- accumulate ---

    #[test]
    fn accumulate_new_entry() {
        let mut allocs = Vec::new();
        accumulate(&mut allocs, 1, BlockGroupFlags::DATA, 1000);
        assert_eq!(allocs.len(), 1);
        assert_eq!(allocs[0].devid, 1);
        assert_eq!(allocs[0].flags, BlockGroupFlags::DATA);
        assert_eq!(allocs[0].bytes, 1000);
    }

    #[test]
    fn accumulate_merge_same_devid_flags() {
        let mut allocs = Vec::new();
        accumulate(&mut allocs, 1, BlockGroupFlags::DATA, 1000);
        accumulate(&mut allocs, 1, BlockGroupFlags::DATA, 2000);
        assert_eq!(allocs.len(), 1);
        assert_eq!(allocs[0].bytes, 3000);
    }

    #[test]
    fn accumulate_separate_different_flags() {
        let mut allocs = Vec::new();
        accumulate(&mut allocs, 1, BlockGroupFlags::DATA, 1000);
        accumulate(&mut allocs, 1, BlockGroupFlags::METADATA, 2000);
        assert_eq!(allocs.len(), 2);
    }

    #[test]
    fn accumulate_separate_different_devids() {
        let mut allocs = Vec::new();
        accumulate(&mut allocs, 1, BlockGroupFlags::DATA, 1000);
        accumulate(&mut allocs, 2, BlockGroupFlags::DATA, 2000);
        assert_eq!(allocs.len(), 2);
    }

    #[test]
    fn accumulate_keeps_insertion_order() {
        let mut allocs = Vec::new();
        accumulate(&mut allocs, 2, BlockGroupFlags::DATA, 10);
        accumulate(&mut allocs, 1, BlockGroupFlags::DATA, 20);
        accumulate(&mut allocs, 2, BlockGroupFlags::DATA, 30);
        let summary: Vec<(u64, u64)> = allocs.iter().map(|a| (a.devid, a.bytes)).collect();
        assert_eq!(summary, vec![(2, 40), (1, 20)]);
    }
}