Skip to main content

btrfs_uapi/
tree_search.rs

1//! # Generic B-tree search: walking any internal btrfs tree via `BTRFS_IOC_TREE_SEARCH`
2//!
3//! The kernel's tree search ioctl lets userspace read any internal btrfs tree
4//! (chunk, root, quota, …) by specifying a key range.  Items are returned in
5//! batches; [`tree_search`] advances the cursor automatically and calls a
6//! closure once per item until the range is exhausted.
7//!
8//! # Byte order
9//!
10//! [`SearchHeader`] fields (objectid, offset, type) are in host byte order:
11//! the kernel fills them in through the ioctl layer.  The `data` slice passed
12//! to the callback contains the raw on-disk item payload, which is
13//! **little-endian**; callers must use `u64::from_le_bytes` and friends when
14//! interpreting it.
15//!
16//! # Ioctl version
17//!
18//! This module uses `BTRFS_IOC_TREE_SEARCH` (v1) with its fixed 3992-byte
19//! result buffer.  This is sufficient for all item types used by this crate;
20//! the v2 variant with a configurable buffer size is not needed.
21
22use crate::raw::{
23    btrfs_ioc_tree_search, btrfs_ioctl_search_args, btrfs_ioctl_search_header,
24    btrfs_ioctl_search_key,
25};
26use std::{
27    mem,
28    os::{fd::AsRawFd, unix::io::BorrowedFd},
29};
30
31/// Parameters specifying which items to return from a tree search.
32///
33/// The kernel searches a 136-bit key space ordered as
34/// `(objectid << 72) | (type << 64) | offset`.
35/// All items whose key falls in the inclusive range `[min_key, max_key]` are
36/// returned.
37///
38/// Build a key for common cases with [`SearchKey::for_type`] or
39/// [`SearchKey::for_objectid_range`].
40///
41/// Tree IDs and item type codes are the `BTRFS_*_OBJECTID` and `BTRFS_*_KEY`
42/// constants from `crate::raw`, cast to `u64` and `u32` respectively at the
43/// call site.
44#[derive(Debug, Clone)]
45pub struct SearchKey {
46    /// Tree to search — use a `BTRFS_*_TREE_OBJECTID` constant from `crate::raw`.
47    pub tree_id: u64,
48    pub min_objectid: u64,
49    pub max_objectid: u64,
50    /// Item type — use a `BTRFS_*_KEY` constant from `crate::raw`.
51    pub min_type: u32,
52    pub max_type: u32,
53    pub min_offset: u64,
54    pub max_offset: u64,
55    /// Filter on the transaction ID of the *metadata block* that holds the
56    /// item, not the transaction that created the item itself.
57    pub min_transid: u64,
58    pub max_transid: u64,
59}
60
61impl SearchKey {
62    /// Return all items of `item_type` in `tree_id`, across every objectid
63    /// and offset.
64    pub fn for_type(tree_id: u64, item_type: u32) -> Self {
65        Self {
66            tree_id,
67            min_objectid: 0,
68            max_objectid: u64::MAX,
69            min_type: item_type,
70            max_type: item_type,
71            min_offset: 0,
72            max_offset: u64::MAX,
73            min_transid: 0,
74            max_transid: u64::MAX,
75        }
76    }
77
78    /// Return all items of `item_type` in `tree_id` whose objectid falls in
79    /// `[min_objectid, max_objectid]`.
80    pub fn for_objectid_range(
81        tree_id: u64,
82        item_type: u32,
83        min_objectid: u64,
84        max_objectid: u64,
85    ) -> Self {
86        Self {
87            min_objectid,
88            max_objectid,
89            ..Self::for_type(tree_id, item_type)
90        }
91    }
92}
93
94/// Metadata returned for each item found by [`tree_search`].
95///
96/// The header fields are in host byte order (the kernel fills them in through
97/// the ioctl layer).  The accompanying `data` slice passed to the callback is
98/// the raw on-disk item payload and is in **little-endian** byte order.
99#[derive(Debug, Clone, Copy)]
100pub struct SearchHeader {
101    pub transid: u64,
102    pub objectid: u64,
103    pub offset: u64,
104    /// Item type (the `type` field of the btrfs key).
105    pub item_type: u32,
106    /// Length in bytes of the item's data payload.
107    pub len: u32,
108}
109
110/// Number of items to request per ioctl call.
111const ITEMS_PER_BATCH: u32 = 4096;
112
113/// Size of `btrfs_ioctl_search_header` as a compile-time constant.
114const SEARCH_HEADER_SIZE: usize = mem::size_of::<btrfs_ioctl_search_header>();
115
116/// Walk every item in the tree that falls within `key`, calling `f` once for
117/// each one.
118///
119/// `f` receives:
120/// * a reference to the item's [`SearchHeader`] (objectid, offset, type, …)
121/// * a byte slice of the item's raw on-disk data payload (little-endian)
122///
123/// The search stops early and the error is returned if:
124/// * `f` returns `Err(_)`
125/// * the underlying `BTRFS_IOC_TREE_SEARCH` ioctl fails
126///
127/// Returns `Ok(())` when the entire requested range has been scanned.
128///
129/// # Privileges
130///
131/// Most trees require `CAP_SYS_ADMIN`.  The subvolume tree of the inode
132/// belonging to `fd` can be searched without elevated privileges by setting
133/// `key.tree_id = 0`.
134pub fn tree_search(
135    fd: BorrowedFd,
136    key: SearchKey,
137    mut f: impl FnMut(&SearchHeader, &[u8]) -> nix::Result<()>,
138) -> nix::Result<()> {
139    let mut args: btrfs_ioctl_search_args = unsafe { mem::zeroed() };
140
141    fill_search_key(&mut args.key, &key);
142
143    loop {
144        args.key.nr_items = ITEMS_PER_BATCH;
145
146        unsafe { btrfs_ioc_tree_search(fd.as_raw_fd(), &mut args) }?;
147
148        let nr = args.key.nr_items;
149        if nr == 0 {
150            break;
151        }
152
153        // Walk the result buffer.  We use raw pointer reads to avoid borrow
154        // checker conflicts: args.buf (read-only after the ioctl) and
155        // args.key (mutated below for cursor advancement) are separate fields,
156        // but a Rust reference to one would prevent mutation of the other.
157        //
158        // SAFETY:
159        // * The ioctl has returned successfully, so args.buf contains nr valid
160        //   (header, data) pairs totalling at most args.buf.len() bytes.
161        // * buf_base is derived from args.buf which lives for the entire loop
162        //   body; it is not invalidated until args is dropped.
163        // * All raw reads are bounds-checked before dereferencing.
164        // * The `data` slices passed to `f` do not outlive this loop
165        //   iteration — `f` takes `&[u8]`, not `&'static [u8]`.
166        let buf_base: *const u8 = args.buf.as_ptr().cast();
167        let buf_cap = args.buf.len();
168
169        let mut off = 0usize;
170        let mut last = SearchHeader {
171            transid: 0,
172            objectid: 0,
173            offset: 0,
174            item_type: 0,
175            len: 0,
176        };
177
178        for _ in 0..nr {
179            if off + SEARCH_HEADER_SIZE > buf_cap {
180                return Err(nix::errno::Errno::EOVERFLOW);
181            }
182            let raw_hdr: btrfs_ioctl_search_header =
183                unsafe { (buf_base.add(off) as *const btrfs_ioctl_search_header).read_unaligned() };
184            let hdr = SearchHeader {
185                transid: raw_hdr.transid,
186                objectid: raw_hdr.objectid,
187                offset: raw_hdr.offset,
188                item_type: raw_hdr.type_,
189                len: raw_hdr.len,
190            };
191            off += SEARCH_HEADER_SIZE;
192
193            let data_len = hdr.len as usize;
194            if off + data_len > buf_cap {
195                return Err(nix::errno::Errno::EOVERFLOW);
196            }
197            let data: &[u8] = unsafe { std::slice::from_raw_parts(buf_base.add(off), data_len) };
198            off += data_len;
199
200            f(&hdr, data)?;
201            last = hdr;
202        }
203
204        if !advance_cursor(&mut args.key, &last) {
205            break;
206        }
207    }
208
209    Ok(())
210}
211
212fn fill_search_key(sk: &mut btrfs_ioctl_search_key, key: &SearchKey) {
213    sk.tree_id = key.tree_id;
214    sk.min_objectid = key.min_objectid;
215    sk.max_objectid = key.max_objectid;
216    sk.min_type = key.min_type;
217    sk.max_type = key.max_type;
218    sk.min_offset = key.min_offset;
219    sk.max_offset = key.max_offset;
220    sk.min_transid = key.min_transid;
221    sk.max_transid = key.max_transid;
222}
223
224/// Advance the search cursor past `last` so the next batch begins from the
225/// item immediately after it in the 136-bit key space
226/// `(objectid << 72) | (type << 64) | offset`.
227///
228/// The kernel interprets `(min_objectid, min_type, min_offset)` as a compound
229/// tuple key, so all three fields must be updated together to point past the
230/// last returned item.  Advancing only `min_offset` while leaving
231/// `min_objectid` at its original value would cause items from lower objectids
232/// that were already returned to satisfy the new minimum and be yielded again.
233///
234/// Returns `false` when the objectid also overflows, meaning the full key
235/// space has been exhausted.
236fn advance_cursor(sk: &mut btrfs_ioctl_search_key, last: &SearchHeader) -> bool {
237    let (new_offset, offset_overflow) = last.offset.overflowing_add(1);
238    if !offset_overflow {
239        sk.min_objectid = last.objectid;
240        sk.min_type = last.item_type;
241        sk.min_offset = new_offset;
242        return true;
243    }
244
245    sk.min_offset = 0;
246    let (new_type, type_overflow) = last.item_type.overflowing_add(1);
247    if !type_overflow {
248        sk.min_objectid = last.objectid;
249        sk.min_type = new_type;
250        return true;
251    }
252
253    sk.min_type = 0;
254    let (new_oid, oid_overflow) = last.objectid.overflowing_add(1);
255    if oid_overflow {
256        return false;
257    }
258    sk.min_objectid = new_oid;
259    true
260}
261
262#[cfg(test)]
263mod tests {
264    use super::*;
265
266    fn header(objectid: u64, item_type: u32, offset: u64) -> SearchHeader {
267        SearchHeader {
268            transid: 0,
269            objectid,
270            offset,
271            item_type,
272            len: 0,
273        }
274    }
275
276    fn zeroed_search_key() -> btrfs_ioctl_search_key {
277        unsafe { mem::zeroed() }
278    }
279
280    // --- SearchKey constructors ---
281
282    #[test]
283    fn for_type_covers_all_objectids_and_offsets() {
284        let sk = SearchKey::for_type(5, 132);
285        assert_eq!(sk.tree_id, 5);
286        assert_eq!(sk.min_objectid, 0);
287        assert_eq!(sk.max_objectid, u64::MAX);
288        assert_eq!(sk.min_type, 132);
289        assert_eq!(sk.max_type, 132);
290        assert_eq!(sk.min_offset, 0);
291        assert_eq!(sk.max_offset, u64::MAX);
292        assert_eq!(sk.min_transid, 0);
293        assert_eq!(sk.max_transid, u64::MAX);
294    }
295
296    #[test]
297    fn for_objectid_range_restricts_objectids() {
298        let sk = SearchKey::for_objectid_range(1, 84, 100, 200);
299        assert_eq!(sk.tree_id, 1);
300        assert_eq!(sk.min_objectid, 100);
301        assert_eq!(sk.max_objectid, 200);
302        assert_eq!(sk.min_type, 84);
303        assert_eq!(sk.max_type, 84);
304        assert_eq!(sk.min_offset, 0);
305        assert_eq!(sk.max_offset, u64::MAX);
306    }
307
308    // --- fill_search_key ---
309
310    #[test]
311    fn fill_search_key_copies_all_fields() {
312        let key = SearchKey {
313            tree_id: 1,
314            min_objectid: 10,
315            max_objectid: 20,
316            min_type: 30,
317            max_type: 40,
318            min_offset: 50,
319            max_offset: 60,
320            min_transid: 70,
321            max_transid: 80,
322        };
323        let mut sk = zeroed_search_key();
324        fill_search_key(&mut sk, &key);
325        assert_eq!(sk.tree_id, 1);
326        assert_eq!(sk.min_objectid, 10);
327        assert_eq!(sk.max_objectid, 20);
328        assert_eq!(sk.min_type, 30);
329        assert_eq!(sk.max_type, 40);
330        assert_eq!(sk.min_offset, 50);
331        assert_eq!(sk.max_offset, 60);
332        assert_eq!(sk.min_transid, 70);
333        assert_eq!(sk.max_transid, 80);
334    }
335
336    // --- advance_cursor: normal case ---
337
338    #[test]
339    fn advance_increments_offset() {
340        let mut sk = zeroed_search_key();
341        let last = header(256, 132, 100);
342        assert!(advance_cursor(&mut sk, &last));
343        assert_eq!(sk.min_objectid, 256);
344        assert_eq!(sk.min_type, 132);
345        assert_eq!(sk.min_offset, 101);
346    }
347
348    #[test]
349    fn advance_tracks_objectid_from_last_item() {
350        // This is the bug that was fixed: the cursor must track the last
351        // item's objectid, not leave min_objectid at its original value.
352        let mut sk = zeroed_search_key();
353        sk.min_objectid = 100; // original search started at 100
354        let last = header(300, 132, 50); // batch ended at objectid 300
355        assert!(advance_cursor(&mut sk, &last));
356        assert_eq!(sk.min_objectid, 300, "must track last item's objectid");
357        assert_eq!(sk.min_type, 132);
358        assert_eq!(sk.min_offset, 51);
359    }
360
361    #[test]
362    fn advance_tracks_type_from_last_item() {
363        let mut sk = zeroed_search_key();
364        let last = header(256, 180, 42);
365        assert!(advance_cursor(&mut sk, &last));
366        assert_eq!(sk.min_objectid, 256);
367        assert_eq!(sk.min_type, 180);
368        assert_eq!(sk.min_offset, 43);
369    }
370
371    // --- advance_cursor: offset overflow ---
372
373    #[test]
374    fn advance_offset_overflow_bumps_type() {
375        let mut sk = zeroed_search_key();
376        let last = header(256, 132, u64::MAX);
377        assert!(advance_cursor(&mut sk, &last));
378        assert_eq!(sk.min_objectid, 256);
379        assert_eq!(sk.min_type, 133);
380        assert_eq!(sk.min_offset, 0);
381    }
382
383    // --- advance_cursor: type overflow ---
384
385    #[test]
386    fn advance_type_overflow_bumps_objectid() {
387        let mut sk = zeroed_search_key();
388        let last = header(256, u32::MAX, u64::MAX);
389        assert!(advance_cursor(&mut sk, &last));
390        assert_eq!(sk.min_objectid, 257);
391        assert_eq!(sk.min_type, 0);
392        assert_eq!(sk.min_offset, 0);
393    }
394
395    // --- advance_cursor: full keyspace exhaustion ---
396
397    #[test]
398    fn advance_all_overflow_returns_false() {
399        let mut sk = zeroed_search_key();
400        let last = header(u64::MAX, u32::MAX, u64::MAX);
401        assert!(!advance_cursor(&mut sk, &last));
402    }
403
404    // --- advance_cursor: edge cases ---
405
406    #[test]
407    fn advance_zero_key() {
408        let mut sk = zeroed_search_key();
409        let last = header(0, 0, 0);
410        assert!(advance_cursor(&mut sk, &last));
411        assert_eq!(sk.min_objectid, 0);
412        assert_eq!(sk.min_type, 0);
413        assert_eq!(sk.min_offset, 1);
414    }
415
416    #[test]
417    fn advance_objectid_max_type_zero_offset_max() {
418        // offset overflows, type bumps to 1
419        let mut sk = zeroed_search_key();
420        let last = header(u64::MAX, 0, u64::MAX);
421        assert!(advance_cursor(&mut sk, &last));
422        assert_eq!(sk.min_objectid, u64::MAX);
423        assert_eq!(sk.min_type, 1);
424        assert_eq!(sk.min_offset, 0);
425    }
426
427    #[test]
428    fn advance_preserves_unrelated_search_key_fields() {
429        let mut sk = zeroed_search_key();
430        sk.max_objectid = 999;
431        sk.max_type = 888;
432        sk.max_offset = 777;
433        sk.max_transid = 666;
434        let last = header(10, 20, 30);
435        advance_cursor(&mut sk, &last);
436        assert_eq!(sk.max_objectid, 999);
437        assert_eq!(sk.max_type, 888);
438        assert_eq!(sk.max_offset, 777);
439        assert_eq!(sk.max_transid, 666);
440    }
441}