// btrfs_uapi/tree_search.rs

1//! # Generic B-tree search: walking any internal btrfs tree via `BTRFS_IOC_TREE_SEARCH`
2//!
3//! The kernel's tree search ioctl lets userspace read any internal btrfs tree
4//! (chunk, root, quota, …) by specifying a key range.  Items are returned in
5//! batches; [`tree_search`] advances the cursor automatically and calls a
6//! closure once per item until the range is exhausted.
7//!
8//! # Byte order
9//!
10//! [`SearchHeader`] fields (objectid, offset, type) are in host byte order:
11//! the kernel fills them in through the ioctl layer.  The `data` slice passed
12//! to the callback contains the raw on-disk item payload, which is
13//! **little-endian**; callers must use `u64::from_le_bytes` and friends when
14//! interpreting it.
15//!
16//! # Ioctl version
17//!
18//! This module provides two variants:
19//!
20//! - [`tree_search`] uses `BTRFS_IOC_TREE_SEARCH` (v1) with its fixed
21//!   3992-byte result buffer. Sufficient for all item types used by this crate.
22//! - [`tree_search_v2`] uses `BTRFS_IOC_TREE_SEARCH_V2` with a caller-chosen
23//!   buffer size. Useful when items may be larger than what v1 can return in a
24//!   single batch.
25
26use crate::raw::{
27    btrfs_ioc_tree_search, btrfs_ioc_tree_search_v2, btrfs_ioctl_search_args,
28    btrfs_ioctl_search_args_v2, btrfs_ioctl_search_header,
29    btrfs_ioctl_search_key,
30};
31use std::{
32    mem,
33    os::{fd::AsRawFd, unix::io::BorrowedFd},
34};
35
/// Parameters specifying which items to return from a tree search.
///
/// The kernel orders keys in a 136-bit space,
/// `(objectid << 72) | (type << 64) | offset`, and returns every item whose
/// key lies in the inclusive range `[min_key, max_key]`.
///
/// For the common cases, build a key with [`SearchKey::for_type`] or
/// [`SearchKey::for_objectid_range`].
///
/// Tree IDs and item type codes are the `BTRFS_*_OBJECTID` and `BTRFS_*_KEY`
/// constants from `crate::raw`, cast to `u64` and `u32` respectively at the
/// call site.
#[derive(Debug, Clone)]
pub struct SearchKey {
    /// Tree to search — use a `BTRFS_*_TREE_OBJECTID` constant from `crate::raw`.
    pub tree_id: u64,
    pub min_objectid: u64,
    pub max_objectid: u64,
    /// Item type — use a `BTRFS_*_KEY` constant from `crate::raw`.
    pub min_type: u32,
    pub max_type: u32,
    pub min_offset: u64,
    pub max_offset: u64,
    /// Filter on the transaction ID of the *metadata block* that holds the
    /// item, not the transaction that created the item itself.
    pub min_transid: u64,
    pub max_transid: u64,
}

impl SearchKey {
    /// Return all items of `item_type` in `tree_id`, across every objectid
    /// and offset.
    pub fn for_type(tree_id: u64, item_type: u32) -> Self {
        Self {
            tree_id,
            min_type: item_type,
            max_type: item_type,
            min_objectid: 0,
            max_objectid: u64::MAX,
            min_offset: 0,
            max_offset: u64::MAX,
            min_transid: 0,
            max_transid: u64::MAX,
        }
    }

    /// Return all items of `item_type` in `tree_id` whose objectid falls in
    /// `[min_objectid, max_objectid]`.
    pub fn for_objectid_range(
        tree_id: u64,
        item_type: u32,
        min_objectid: u64,
        max_objectid: u64,
    ) -> Self {
        let mut key = Self::for_type(tree_id, item_type);
        key.min_objectid = min_objectid;
        key.max_objectid = max_objectid;
        key
    }
}
98
/// Per-item metadata handed to the [`tree_search`] callback.
///
/// Header fields arrive in host byte order (the kernel converts them in the
/// ioctl layer).  The `data` slice passed to the callback alongside this
/// header is the raw on-disk item payload and is therefore **little-endian**.
#[derive(Debug, Clone, Copy)]
pub struct SearchHeader {
    pub transid: u64,
    pub objectid: u64,
    pub offset: u64,
    /// Item type (the `type` field of the btrfs key).
    pub item_type: u32,
    /// Length in bytes of the item's data payload.
    pub len: u32,
}
114
/// Number of items to request per ioctl call.
///
/// This is an upper bound per batch; the kernel reports the actual number
/// returned back through `key.nr_items` after each call.  NOTE(review): with
/// v1's fixed 3992-byte buffer the buffer presumably fills well before 4096
/// items — the loop copes either way by reading `nr_items` back.
const ITEMS_PER_BATCH: u32 = 4096;

/// Size of `btrfs_ioctl_search_header` as a compile-time constant.
const SEARCH_HEADER_SIZE: usize = mem::size_of::<btrfs_ioctl_search_header>();
120
121/// Walk every item in the tree that falls within `key`, calling `f` once for
122/// each one.
123///
124/// `f` receives:
125/// * a reference to the item's [`SearchHeader`] (objectid, offset, type, …)
126/// * a byte slice of the item's raw on-disk data payload (little-endian)
127///
128/// The search stops early and the error is returned if:
129/// * `f` returns `Err(_)`
130/// * the underlying `BTRFS_IOC_TREE_SEARCH` ioctl fails
131///
132/// Returns `Ok(())` when the entire requested range has been scanned.
133///
134/// # Privileges
135///
136/// Most trees require `CAP_SYS_ADMIN`.  The subvolume tree of the inode
137/// belonging to `fd` can be searched without elevated privileges by setting
138/// `key.tree_id = 0`.
139pub fn tree_search(
140    fd: BorrowedFd,
141    key: SearchKey,
142    mut f: impl FnMut(&SearchHeader, &[u8]) -> nix::Result<()>,
143) -> nix::Result<()> {
144    let mut args: btrfs_ioctl_search_args = unsafe { mem::zeroed() };
145
146    fill_search_key(&mut args.key, &key);
147
148    loop {
149        args.key.nr_items = ITEMS_PER_BATCH;
150
151        unsafe { btrfs_ioc_tree_search(fd.as_raw_fd(), &mut args) }?;
152
153        let nr = args.key.nr_items;
154        if nr == 0 {
155            break;
156        }
157
158        // Walk the result buffer.  We use raw pointer reads to avoid borrow
159        // checker conflicts: args.buf (read-only after the ioctl) and
160        // args.key (mutated below for cursor advancement) are separate fields,
161        // but a Rust reference to one would prevent mutation of the other.
162        //
163        // SAFETY:
164        // * The ioctl has returned successfully, so args.buf contains nr valid
165        //   (header, data) pairs totalling at most args.buf.len() bytes.
166        // * buf_base is derived from args.buf which lives for the entire loop
167        //   body; it is not invalidated until args is dropped.
168        // * All raw reads are bounds-checked before dereferencing.
169        // * The `data` slices passed to `f` do not outlive this loop
170        //   iteration — `f` takes `&[u8]`, not `&'static [u8]`.
171        let buf_base: *const u8 = args.buf.as_ptr().cast();
172        let buf_cap = args.buf.len();
173
174        let mut off = 0usize;
175        let mut last = SearchHeader {
176            transid: 0,
177            objectid: 0,
178            offset: 0,
179            item_type: 0,
180            len: 0,
181        };
182
183        for _ in 0..nr {
184            if off + SEARCH_HEADER_SIZE > buf_cap {
185                return Err(nix::errno::Errno::EOVERFLOW);
186            }
187            let raw_hdr: btrfs_ioctl_search_header = unsafe {
188                (buf_base.add(off) as *const btrfs_ioctl_search_header)
189                    .read_unaligned()
190            };
191            let hdr = SearchHeader {
192                transid: raw_hdr.transid,
193                objectid: raw_hdr.objectid,
194                offset: raw_hdr.offset,
195                item_type: raw_hdr.type_,
196                len: raw_hdr.len,
197            };
198            off += SEARCH_HEADER_SIZE;
199
200            let data_len = hdr.len as usize;
201            if off + data_len > buf_cap {
202                return Err(nix::errno::Errno::EOVERFLOW);
203            }
204            let data: &[u8] = unsafe {
205                std::slice::from_raw_parts(buf_base.add(off), data_len)
206            };
207            off += data_len;
208
209            f(&hdr, data)?;
210            last = hdr;
211        }
212
213        if !advance_cursor(&mut args.key, &last) {
214            break;
215        }
216    }
217
218    Ok(())
219}
220
221/// Default buffer size for [`tree_search_v2`]: 64 KiB.
222const DEFAULT_V2_BUF_SIZE: usize = 64 * 1024;
223
224/// Like [`tree_search`] but uses `BTRFS_IOC_TREE_SEARCH_V2` with a larger
225/// result buffer.
226///
227/// `buf_size` controls the buffer size in bytes (default 64 KiB if `None`).
228/// The v2 ioctl is otherwise identical to v1 but can return more data per
229/// batch, reducing the number of round-trips for large result sets.
230///
231/// If `buf_size` is too small for even a single item, the kernel returns
232/// `EOVERFLOW` and sets `buf_size` to the required minimum. This function
233/// automatically retries with the larger buffer.
234pub fn tree_search_v2(
235    fd: BorrowedFd,
236    key: SearchKey,
237    buf_size: Option<usize>,
238    mut f: impl FnMut(&SearchHeader, &[u8]) -> nix::Result<()>,
239) -> nix::Result<()> {
240    let base_size = mem::size_of::<btrfs_ioctl_search_args_v2>();
241    let mut capacity = buf_size.unwrap_or(DEFAULT_V2_BUF_SIZE);
242
243    // Allocate as Vec<u64> for 8-byte alignment.
244    let alloc_bytes = base_size + capacity;
245    let num_u64s = alloc_bytes.div_ceil(mem::size_of::<u64>());
246    let mut buf = vec![0u64; num_u64s];
247
248    // SAFETY: buf is correctly sized and aligned for btrfs_ioctl_search_args_v2.
249    let args_ptr = buf.as_mut_ptr() as *mut btrfs_ioctl_search_args_v2;
250    unsafe {
251        fill_search_key(&mut (*args_ptr).key, &key);
252    }
253
254    loop {
255        unsafe {
256            (*args_ptr).key.nr_items = ITEMS_PER_BATCH;
257            (*args_ptr).buf_size = capacity as u64;
258        }
259
260        match unsafe {
261            btrfs_ioc_tree_search_v2(fd.as_raw_fd(), &mut *args_ptr)
262        } {
263            Ok(_) => {}
264            Err(nix::errno::Errno::EOVERFLOW) => {
265                // Kernel tells us the needed size via buf_size.
266                let needed = unsafe { (*args_ptr).buf_size } as usize;
267                if needed <= capacity {
268                    return Err(nix::errno::Errno::EOVERFLOW);
269                }
270                capacity = needed;
271                let alloc_bytes = base_size + capacity;
272                let num_u64s = alloc_bytes.div_ceil(mem::size_of::<u64>());
273                buf.resize(num_u64s, 0);
274                // args_ptr must be refreshed after reallocation.
275                let args_ptr_new =
276                    buf.as_mut_ptr() as *mut btrfs_ioctl_search_args_v2;
277                unsafe {
278                    (*args_ptr_new).key.nr_items = ITEMS_PER_BATCH;
279                    (*args_ptr_new).buf_size = capacity as u64;
280                    btrfs_ioc_tree_search_v2(
281                        fd.as_raw_fd(),
282                        &mut *args_ptr_new,
283                    )?;
284                }
285                // Fall through to process results with the new pointer.
286                // Update our local for the rest of the loop.
287                let _ = args_ptr_new;
288            }
289            Err(e) => return Err(e),
290        }
291
292        // Re-derive pointer after potential reallocation.
293        let args_ptr = buf.as_mut_ptr() as *mut btrfs_ioctl_search_args_v2;
294
295        let nr = unsafe { (*args_ptr).key.nr_items };
296        if nr == 0 {
297            break;
298        }
299
300        // The result data starts right after the base struct (at the
301        // flexible array member `buf[]`).
302        let data_base: *const u8 =
303            unsafe { (args_ptr as *const u8).add(base_size) };
304
305        let mut off = 0usize;
306        let mut last = SearchHeader {
307            transid: 0,
308            objectid: 0,
309            offset: 0,
310            item_type: 0,
311            len: 0,
312        };
313
314        for _ in 0..nr {
315            if off + SEARCH_HEADER_SIZE > capacity {
316                return Err(nix::errno::Errno::EOVERFLOW);
317            }
318            let raw_hdr: btrfs_ioctl_search_header = unsafe {
319                (data_base.add(off) as *const btrfs_ioctl_search_header)
320                    .read_unaligned()
321            };
322            let hdr = SearchHeader {
323                transid: raw_hdr.transid,
324                objectid: raw_hdr.objectid,
325                offset: raw_hdr.offset,
326                item_type: raw_hdr.type_,
327                len: raw_hdr.len,
328            };
329            off += SEARCH_HEADER_SIZE;
330
331            let data_len = hdr.len as usize;
332            if off + data_len > capacity {
333                return Err(nix::errno::Errno::EOVERFLOW);
334            }
335            let data: &[u8] = unsafe {
336                std::slice::from_raw_parts(data_base.add(off), data_len)
337            };
338            off += data_len;
339
340            f(&hdr, data)?;
341            last = hdr;
342        }
343
344        if !advance_cursor(unsafe { &mut (*args_ptr).key }, &last) {
345            break;
346        }
347    }
348
349    Ok(())
350}
351
352fn fill_search_key(sk: &mut btrfs_ioctl_search_key, key: &SearchKey) {
353    sk.tree_id = key.tree_id;
354    sk.min_objectid = key.min_objectid;
355    sk.max_objectid = key.max_objectid;
356    sk.min_type = key.min_type;
357    sk.max_type = key.max_type;
358    sk.min_offset = key.min_offset;
359    sk.max_offset = key.max_offset;
360    sk.min_transid = key.min_transid;
361    sk.max_transid = key.max_transid;
362}
363
364/// Advance the search cursor past `last` so the next batch begins from the
365/// item immediately after it in the 136-bit key space
366/// `(objectid << 72) | (type << 64) | offset`.
367///
368/// The kernel interprets `(min_objectid, min_type, min_offset)` as a compound
369/// tuple key, so all three fields must be updated together to point past the
370/// last returned item.  Advancing only `min_offset` while leaving
371/// `min_objectid` at its original value would cause items from lower objectids
372/// that were already returned to satisfy the new minimum and be yielded again.
373///
374/// Returns `false` when the objectid also overflows, meaning the full key
375/// space has been exhausted.
376fn advance_cursor(
377    sk: &mut btrfs_ioctl_search_key,
378    last: &SearchHeader,
379) -> bool {
380    let (new_offset, offset_overflow) = last.offset.overflowing_add(1);
381    if !offset_overflow {
382        sk.min_objectid = last.objectid;
383        sk.min_type = last.item_type;
384        sk.min_offset = new_offset;
385        return true;
386    }
387
388    sk.min_offset = 0;
389    let (new_type, type_overflow) = last.item_type.overflowing_add(1);
390    if !type_overflow {
391        sk.min_objectid = last.objectid;
392        sk.min_type = new_type;
393        return true;
394    }
395
396    sk.min_type = 0;
397    let (new_oid, oid_overflow) = last.objectid.overflowing_add(1);
398    if oid_overflow {
399        return false;
400    }
401    sk.min_objectid = new_oid;
402    true
403}
404
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a [`SearchHeader`] with only the key components set; `transid`
    /// and `len` are irrelevant to cursor arithmetic and stay zero.
    fn header(objectid: u64, item_type: u32, offset: u64) -> SearchHeader {
        SearchHeader {
            transid: 0,
            objectid,
            offset,
            item_type,
            len: 0,
        }
    }

    /// An all-zero kernel search key for tests to mutate.
    fn zeroed_search_key() -> btrfs_ioctl_search_key {
        // SAFETY: btrfs_ioctl_search_key is a plain-old-data ioctl struct;
        // the all-zero bit pattern is a valid value for every field.
        unsafe { mem::zeroed() }
    }

    // --- SearchKey constructors ---

    #[test]
    fn for_type_covers_all_objectids_and_offsets() {
        let sk = SearchKey::for_type(5, 132);
        assert_eq!(sk.tree_id, 5);
        assert_eq!(sk.min_objectid, 0);
        assert_eq!(sk.max_objectid, u64::MAX);
        assert_eq!(sk.min_type, 132);
        assert_eq!(sk.max_type, 132);
        assert_eq!(sk.min_offset, 0);
        assert_eq!(sk.max_offset, u64::MAX);
        assert_eq!(sk.min_transid, 0);
        assert_eq!(sk.max_transid, u64::MAX);
    }

    #[test]
    fn for_objectid_range_restricts_objectids() {
        let sk = SearchKey::for_objectid_range(1, 84, 100, 200);
        assert_eq!(sk.tree_id, 1);
        assert_eq!(sk.min_objectid, 100);
        assert_eq!(sk.max_objectid, 200);
        assert_eq!(sk.min_type, 84);
        assert_eq!(sk.max_type, 84);
        assert_eq!(sk.min_offset, 0);
        assert_eq!(sk.max_offset, u64::MAX);
    }

    // --- fill_search_key ---

    #[test]
    fn fill_search_key_copies_all_fields() {
        let key = SearchKey {
            tree_id: 1,
            min_objectid: 10,
            max_objectid: 20,
            min_type: 30,
            max_type: 40,
            min_offset: 50,
            max_offset: 60,
            min_transid: 70,
            max_transid: 80,
        };
        let mut sk = zeroed_search_key();
        fill_search_key(&mut sk, &key);
        assert_eq!(sk.tree_id, 1);
        assert_eq!(sk.min_objectid, 10);
        assert_eq!(sk.max_objectid, 20);
        assert_eq!(sk.min_type, 30);
        assert_eq!(sk.max_type, 40);
        assert_eq!(sk.min_offset, 50);
        assert_eq!(sk.max_offset, 60);
        assert_eq!(sk.min_transid, 70);
        assert_eq!(sk.max_transid, 80);
    }

    // --- advance_cursor: normal case ---

    #[test]
    fn advance_increments_offset() {
        let mut sk = zeroed_search_key();
        let last = header(256, 132, 100);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 256);
        assert_eq!(sk.min_type, 132);
        assert_eq!(sk.min_offset, 101);
    }

    #[test]
    fn advance_tracks_objectid_from_last_item() {
        // This is the bug that was fixed: the cursor must track the last
        // item's objectid, not leave min_objectid at its original value.
        let mut sk = zeroed_search_key();
        sk.min_objectid = 100; // original search started at 100
        let last = header(300, 132, 50); // batch ended at objectid 300
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 300, "must track last item's objectid");
        assert_eq!(sk.min_type, 132);
        assert_eq!(sk.min_offset, 51);
    }

    #[test]
    fn advance_tracks_type_from_last_item() {
        let mut sk = zeroed_search_key();
        let last = header(256, 180, 42);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 256);
        assert_eq!(sk.min_type, 180);
        assert_eq!(sk.min_offset, 43);
    }

    // --- advance_cursor: offset overflow ---

    #[test]
    fn advance_offset_overflow_bumps_type() {
        let mut sk = zeroed_search_key();
        let last = header(256, 132, u64::MAX);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 256);
        assert_eq!(sk.min_type, 133);
        assert_eq!(sk.min_offset, 0);
    }

    // --- advance_cursor: type overflow ---

    #[test]
    fn advance_type_overflow_bumps_objectid() {
        let mut sk = zeroed_search_key();
        let last = header(256, u32::MAX, u64::MAX);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 257);
        assert_eq!(sk.min_type, 0);
        assert_eq!(sk.min_offset, 0);
    }

    // --- advance_cursor: full keyspace exhaustion ---

    #[test]
    fn advance_all_overflow_returns_false() {
        let mut sk = zeroed_search_key();
        let last = header(u64::MAX, u32::MAX, u64::MAX);
        assert!(!advance_cursor(&mut sk, &last));
    }

    // --- advance_cursor: edge cases ---

    #[test]
    fn advance_zero_key() {
        let mut sk = zeroed_search_key();
        let last = header(0, 0, 0);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 0);
        assert_eq!(sk.min_type, 0);
        assert_eq!(sk.min_offset, 1);
    }

    #[test]
    fn advance_objectid_max_type_zero_offset_max() {
        // offset overflows, type bumps to 1
        let mut sk = zeroed_search_key();
        let last = header(u64::MAX, 0, u64::MAX);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, u64::MAX);
        assert_eq!(sk.min_type, 1);
        assert_eq!(sk.min_offset, 0);
    }

    #[test]
    fn advance_preserves_unrelated_search_key_fields() {
        // advance_cursor must only touch the min_* cursor components.
        let mut sk = zeroed_search_key();
        sk.max_objectid = 999;
        sk.max_type = 888;
        sk.max_offset = 777;
        sk.max_transid = 666;
        let last = header(10, 20, 30);
        advance_cursor(&mut sk, &last);
        assert_eq!(sk.max_objectid, 999);
        assert_eq!(sk.max_type, 888);
        assert_eq!(sk.max_offset, 777);
        assert_eq!(sk.max_transid, 666);
    }
}
584}