Skip to main content

btrfs_uapi/
tree_search.rs

1//! # Generic B-tree search: walking any internal btrfs tree via `BTRFS_IOC_TREE_SEARCH`
2//!
3//! The kernel's tree search ioctl lets userspace read any internal btrfs tree
4//! (chunk, root, quota, …) by specifying a key range.  Items are returned in
5//! batches; [`tree_search`] advances the cursor automatically and calls a
6//! closure once per item until the range is exhausted.
7//!
8//! # Byte order
9//!
10//! [`SearchHeader`] fields (objectid, offset, type) are in host byte order:
11//! the kernel fills them in through the ioctl layer.  The `data` slice passed
12//! to the callback contains the raw on-disk item payload, which is
13//! **little-endian**; callers must use `u64::from_le_bytes` and friends when
14//! interpreting it.
15//!
16//! # Ioctl version
17//!
18//! This module provides two variants:
19//!
20//! - [`tree_search`] uses `BTRFS_IOC_TREE_SEARCH` (v1) with its fixed
21//!   3992-byte result buffer. Sufficient for all item types used by this crate.
22//! - [`tree_search_v2`] uses `BTRFS_IOC_TREE_SEARCH_V2` with a caller-chosen
23//!   buffer size. Useful when items may be larger than what v1 can return in a
24//!   single batch.
25
26use crate::raw::{
27    btrfs_ioc_tree_search, btrfs_ioc_tree_search_v2, btrfs_ioctl_search_args,
28    btrfs_ioctl_search_args_v2, btrfs_ioctl_search_header,
29    btrfs_ioctl_search_key,
30};
31use std::{
32    mem,
33    os::{fd::AsRawFd, unix::io::BorrowedFd},
34};
35
36/// Parameters specifying which items to return from a tree search.
37///
38/// The kernel searches a 136-bit key space ordered as
39/// `(objectid << 72) | (type << 64) | offset`.
40/// All items whose key falls in the inclusive range `[min_key, max_key]` are
41/// returned.
42///
43/// Build a key for common cases with [`SearchKey::for_type`] or
44/// [`SearchKey::for_objectid_range`].
45///
46/// Tree IDs and item type codes are the `BTRFS_*_OBJECTID` and `BTRFS_*_KEY`
47/// constants from `crate::raw`, cast to `u64` and `u32` respectively at the
48/// call site.
49#[derive(Debug, Clone)]
50pub struct SearchKey {
51    /// Tree to search — use a `BTRFS_*_TREE_OBJECTID` constant from `crate::raw`.
52    pub tree_id: u64,
53    pub min_objectid: u64,
54    pub max_objectid: u64,
55    /// Item type — use a `BTRFS_*_KEY` constant from `crate::raw`.
56    pub min_type: u32,
57    pub max_type: u32,
58    pub min_offset: u64,
59    pub max_offset: u64,
60    /// Filter on the transaction ID of the *metadata block* that holds the
61    /// item, not the transaction that created the item itself.
62    pub min_transid: u64,
63    pub max_transid: u64,
64}
65
66impl SearchKey {
67    /// Return all items of `item_type` in `tree_id`, across every objectid
68    /// and offset.
69    #[must_use]
70    pub fn for_type(tree_id: u64, item_type: u32) -> Self {
71        Self {
72            tree_id,
73            min_objectid: 0,
74            max_objectid: u64::MAX,
75            min_type: item_type,
76            max_type: item_type,
77            min_offset: 0,
78            max_offset: u64::MAX,
79            min_transid: 0,
80            max_transid: u64::MAX,
81        }
82    }
83
84    /// Return all items of `item_type` in `tree_id` whose objectid falls in
85    /// `[min_objectid, max_objectid]`.
86    #[must_use]
87    pub fn for_objectid_range(
88        tree_id: u64,
89        item_type: u32,
90        min_objectid: u64,
91        max_objectid: u64,
92    ) -> Self {
93        Self {
94            min_objectid,
95            max_objectid,
96            ..Self::for_type(tree_id, item_type)
97        }
98    }
99}
100
101/// Metadata returned for each item found by [`tree_search`].
102///
103/// The header fields are in host byte order (the kernel fills them in through
104/// the ioctl layer).  The accompanying `data` slice passed to the callback is
105/// the raw on-disk item payload and is in **little-endian** byte order.
106#[derive(Debug, Clone, Copy)]
107pub struct SearchHeader {
108    pub transid: u64,
109    pub objectid: u64,
110    pub offset: u64,
111    /// Item type (the `type` field of the btrfs key).
112    pub item_type: u32,
113    /// Length in bytes of the item's data payload.
114    pub len: u32,
115}
116
117/// Number of items to request per ioctl call.
118const ITEMS_PER_BATCH: u32 = 4096;
119
120/// Size of `btrfs_ioctl_search_header` as a compile-time constant.
121const SEARCH_HEADER_SIZE: usize = mem::size_of::<btrfs_ioctl_search_header>();
122
123/// Walk every item in the tree that falls within `key`, calling `f` once for
124/// each one.
125///
126/// `f` receives:
127/// * a reference to the item's [`SearchHeader`] (objectid, offset, type, …)
128/// * a byte slice of the item's raw on-disk data payload (little-endian)
129///
130/// The search stops early and the error is returned if:
131/// * `f` returns `Err(_)`
132/// * the underlying `BTRFS_IOC_TREE_SEARCH` ioctl fails
133///
134/// Returns `Ok(())` when the entire requested range has been scanned.
135///
136/// # Privileges
137///
138/// Most trees require `CAP_SYS_ADMIN`.  The subvolume tree of the inode
139/// belonging to `fd` can be searched without elevated privileges by setting
140/// `key.tree_id = 0`.
141pub fn tree_search(
142    fd: BorrowedFd,
143    key: SearchKey,
144    mut f: impl FnMut(&SearchHeader, &[u8]) -> nix::Result<()>,
145) -> nix::Result<()> {
146    let mut args: btrfs_ioctl_search_args = unsafe { mem::zeroed() };
147
148    fill_search_key(&mut args.key, &key);
149
150    loop {
151        args.key.nr_items = ITEMS_PER_BATCH;
152
153        unsafe { btrfs_ioc_tree_search(fd.as_raw_fd(), &raw mut args) }?;
154
155        let nr = args.key.nr_items;
156        if nr == 0 {
157            break;
158        }
159
160        // Walk the result buffer.  We use raw pointer reads to avoid borrow
161        // checker conflicts: args.buf (read-only after the ioctl) and
162        // args.key (mutated below for cursor advancement) are separate fields,
163        // but a Rust reference to one would prevent mutation of the other.
164        //
165        // SAFETY:
166        // * The ioctl has returned successfully, so args.buf contains nr valid
167        //   (header, data) pairs totalling at most args.buf.len() bytes.
168        // * buf_base is derived from args.buf which lives for the entire loop
169        //   body; it is not invalidated until args is dropped.
170        // * All raw reads are bounds-checked before dereferencing.
171        // * The `data` slices passed to `f` do not outlive this loop
172        //   iteration — `f` takes `&[u8]`, not `&'static [u8]`.
173        let buf_base: *const u8 = args.buf.as_ptr().cast();
174        let buf_cap = args.buf.len();
175
176        let mut off = 0usize;
177        let mut last = SearchHeader {
178            transid: 0,
179            objectid: 0,
180            offset: 0,
181            item_type: 0,
182            len: 0,
183        };
184
185        for _ in 0..nr {
186            if off + SEARCH_HEADER_SIZE > buf_cap {
187                return Err(nix::errno::Errno::EOVERFLOW);
188            }
189            let raw_hdr: btrfs_ioctl_search_header = unsafe {
190                buf_base
191                    .add(off)
192                    .cast::<btrfs_ioctl_search_header>()
193                    .read_unaligned()
194            };
195            let hdr = SearchHeader {
196                transid: raw_hdr.transid,
197                objectid: raw_hdr.objectid,
198                offset: raw_hdr.offset,
199                item_type: raw_hdr.type_,
200                len: raw_hdr.len,
201            };
202            off += SEARCH_HEADER_SIZE;
203
204            let data_len = hdr.len as usize;
205            if off + data_len > buf_cap {
206                return Err(nix::errno::Errno::EOVERFLOW);
207            }
208            let data: &[u8] = unsafe {
209                std::slice::from_raw_parts(buf_base.add(off), data_len)
210            };
211            off += data_len;
212
213            f(&hdr, data)?;
214            last = hdr;
215        }
216
217        if !advance_cursor(&mut args.key, &last) {
218            break;
219        }
220    }
221
222    Ok(())
223}
224
225/// Default buffer size for [`tree_search_v2`]: 64 KiB.
226const DEFAULT_V2_BUF_SIZE: usize = 64 * 1024;
227
228/// Like [`tree_search`] but uses `BTRFS_IOC_TREE_SEARCH_V2` with a larger
229/// result buffer.
230///
231/// `buf_size` controls the buffer size in bytes (default 64 KiB if `None`).
232/// The v2 ioctl is otherwise identical to v1 but can return more data per
233/// batch, reducing the number of round-trips for large result sets.
234///
235/// If `buf_size` is too small for even a single item, the kernel returns
236/// `EOVERFLOW` and sets `buf_size` to the required minimum. This function
237/// automatically retries with the larger buffer.
238pub fn tree_search_v2(
239    fd: BorrowedFd,
240    key: SearchKey,
241    buf_size: Option<usize>,
242    mut f: impl FnMut(&SearchHeader, &[u8]) -> nix::Result<()>,
243) -> nix::Result<()> {
244    let base_size = mem::size_of::<btrfs_ioctl_search_args_v2>();
245    let mut capacity = buf_size.unwrap_or(DEFAULT_V2_BUF_SIZE);
246
247    // Allocate as Vec<u64> for 8-byte alignment.
248    let alloc_bytes = base_size + capacity;
249    let num_u64s = alloc_bytes.div_ceil(mem::size_of::<u64>());
250    let mut buf = vec![0u64; num_u64s];
251
252    // SAFETY: buf is correctly sized and aligned for btrfs_ioctl_search_args_v2.
253    let args_ptr = buf.as_mut_ptr().cast::<btrfs_ioctl_search_args_v2>();
254    unsafe {
255        fill_search_key(&mut (*args_ptr).key, &key);
256    }
257
258    loop {
259        unsafe {
260            (*args_ptr).key.nr_items = ITEMS_PER_BATCH;
261            (*args_ptr).buf_size = capacity as u64;
262        }
263
264        match unsafe {
265            btrfs_ioc_tree_search_v2(fd.as_raw_fd(), &raw mut *args_ptr)
266        } {
267            Ok(_) => {}
268            Err(nix::errno::Errno::EOVERFLOW) => {
269                // Kernel tells us the needed size via buf_size.
270                let needed = unsafe { (*args_ptr).buf_size } as usize;
271                if needed <= capacity {
272                    return Err(nix::errno::Errno::EOVERFLOW);
273                }
274                capacity = needed;
275                let alloc_bytes = base_size + capacity;
276                let num_u64s = alloc_bytes.div_ceil(mem::size_of::<u64>());
277                buf.resize(num_u64s, 0);
278                // args_ptr must be refreshed after reallocation.
279                let args_ptr_new =
280                    buf.as_mut_ptr().cast::<btrfs_ioctl_search_args_v2>();
281                unsafe {
282                    (*args_ptr_new).key.nr_items = ITEMS_PER_BATCH;
283                    (*args_ptr_new).buf_size = capacity as u64;
284                    btrfs_ioc_tree_search_v2(
285                        fd.as_raw_fd(),
286                        &raw mut *args_ptr_new,
287                    )?;
288                }
289                // Fall through to process results with the new pointer.
290                // Update our local for the rest of the loop.
291                let _ = args_ptr_new;
292            }
293            Err(e) => return Err(e),
294        }
295
296        // Re-derive pointer after potential reallocation.
297        let args_ptr = buf.as_mut_ptr().cast::<btrfs_ioctl_search_args_v2>();
298
299        let nr = unsafe { (*args_ptr).key.nr_items };
300        if nr == 0 {
301            break;
302        }
303
304        // The result data starts right after the base struct (at the
305        // flexible array member `buf[]`).
306        let data_base: *const u8 =
307            unsafe { (args_ptr as *const u8).add(base_size) };
308
309        let mut off = 0usize;
310        let mut last = SearchHeader {
311            transid: 0,
312            objectid: 0,
313            offset: 0,
314            item_type: 0,
315            len: 0,
316        };
317
318        for _ in 0..nr {
319            if off + SEARCH_HEADER_SIZE > capacity {
320                return Err(nix::errno::Errno::EOVERFLOW);
321            }
322            let raw_hdr: btrfs_ioctl_search_header = unsafe {
323                data_base
324                    .add(off)
325                    .cast::<btrfs_ioctl_search_header>()
326                    .read_unaligned()
327            };
328            let hdr = SearchHeader {
329                transid: raw_hdr.transid,
330                objectid: raw_hdr.objectid,
331                offset: raw_hdr.offset,
332                item_type: raw_hdr.type_,
333                len: raw_hdr.len,
334            };
335            off += SEARCH_HEADER_SIZE;
336
337            let data_len = hdr.len as usize;
338            if off + data_len > capacity {
339                return Err(nix::errno::Errno::EOVERFLOW);
340            }
341            let data: &[u8] = unsafe {
342                std::slice::from_raw_parts(data_base.add(off), data_len)
343            };
344            off += data_len;
345
346            f(&hdr, data)?;
347            last = hdr;
348        }
349
350        if !advance_cursor(unsafe { &mut (*args_ptr).key }, &last) {
351            break;
352        }
353    }
354
355    Ok(())
356}
357
358fn fill_search_key(sk: &mut btrfs_ioctl_search_key, key: &SearchKey) {
359    sk.tree_id = key.tree_id;
360    sk.min_objectid = key.min_objectid;
361    sk.max_objectid = key.max_objectid;
362    sk.min_type = key.min_type;
363    sk.max_type = key.max_type;
364    sk.min_offset = key.min_offset;
365    sk.max_offset = key.max_offset;
366    sk.min_transid = key.min_transid;
367    sk.max_transid = key.max_transid;
368}
369
370/// Advance the search cursor past `last` so the next batch begins from the
371/// item immediately after it in the 136-bit key space
372/// `(objectid << 72) | (type << 64) | offset`.
373///
374/// The kernel interprets `(min_objectid, min_type, min_offset)` as a compound
375/// tuple key, so all three fields must be updated together to point past the
376/// last returned item.  Advancing only `min_offset` while leaving
377/// `min_objectid` at its original value would cause items from lower objectids
378/// that were already returned to satisfy the new minimum and be yielded again.
379///
380/// Returns `false` when the objectid also overflows, meaning the full key
381/// space has been exhausted.
382fn advance_cursor(
383    sk: &mut btrfs_ioctl_search_key,
384    last: &SearchHeader,
385) -> bool {
386    let (new_offset, offset_overflow) = last.offset.overflowing_add(1);
387    if !offset_overflow {
388        sk.min_objectid = last.objectid;
389        sk.min_type = last.item_type;
390        sk.min_offset = new_offset;
391        return true;
392    }
393
394    sk.min_offset = 0;
395    let (new_type, type_overflow) = last.item_type.overflowing_add(1);
396    if !type_overflow {
397        sk.min_objectid = last.objectid;
398        sk.min_type = new_type;
399        return true;
400    }
401
402    sk.min_type = 0;
403    let (new_oid, oid_overflow) = last.objectid.overflowing_add(1);
404    if oid_overflow {
405        return false;
406    }
407    sk.min_objectid = new_oid;
408    true
409}
410
411#[cfg(test)]
412mod tests {
413    use super::*;
414
415    fn header(objectid: u64, item_type: u32, offset: u64) -> SearchHeader {
416        SearchHeader {
417            transid: 0,
418            objectid,
419            offset,
420            item_type,
421            len: 0,
422        }
423    }
424
425    fn zeroed_search_key() -> btrfs_ioctl_search_key {
426        unsafe { mem::zeroed() }
427    }
428
429    // --- SearchKey constructors ---
430
431    #[test]
432    fn for_type_covers_all_objectids_and_offsets() {
433        let sk = SearchKey::for_type(5, 132);
434        assert_eq!(sk.tree_id, 5);
435        assert_eq!(sk.min_objectid, 0);
436        assert_eq!(sk.max_objectid, u64::MAX);
437        assert_eq!(sk.min_type, 132);
438        assert_eq!(sk.max_type, 132);
439        assert_eq!(sk.min_offset, 0);
440        assert_eq!(sk.max_offset, u64::MAX);
441        assert_eq!(sk.min_transid, 0);
442        assert_eq!(sk.max_transid, u64::MAX);
443    }
444
445    #[test]
446    fn for_objectid_range_restricts_objectids() {
447        let sk = SearchKey::for_objectid_range(1, 84, 100, 200);
448        assert_eq!(sk.tree_id, 1);
449        assert_eq!(sk.min_objectid, 100);
450        assert_eq!(sk.max_objectid, 200);
451        assert_eq!(sk.min_type, 84);
452        assert_eq!(sk.max_type, 84);
453        assert_eq!(sk.min_offset, 0);
454        assert_eq!(sk.max_offset, u64::MAX);
455    }
456
457    // --- fill_search_key ---
458
459    #[test]
460    fn fill_search_key_copies_all_fields() {
461        let key = SearchKey {
462            tree_id: 1,
463            min_objectid: 10,
464            max_objectid: 20,
465            min_type: 30,
466            max_type: 40,
467            min_offset: 50,
468            max_offset: 60,
469            min_transid: 70,
470            max_transid: 80,
471        };
472        let mut sk = zeroed_search_key();
473        fill_search_key(&mut sk, &key);
474        assert_eq!(sk.tree_id, 1);
475        assert_eq!(sk.min_objectid, 10);
476        assert_eq!(sk.max_objectid, 20);
477        assert_eq!(sk.min_type, 30);
478        assert_eq!(sk.max_type, 40);
479        assert_eq!(sk.min_offset, 50);
480        assert_eq!(sk.max_offset, 60);
481        assert_eq!(sk.min_transid, 70);
482        assert_eq!(sk.max_transid, 80);
483    }
484
485    // --- advance_cursor: normal case ---
486
487    #[test]
488    fn advance_increments_offset() {
489        let mut sk = zeroed_search_key();
490        let last = header(256, 132, 100);
491        assert!(advance_cursor(&mut sk, &last));
492        assert_eq!(sk.min_objectid, 256);
493        assert_eq!(sk.min_type, 132);
494        assert_eq!(sk.min_offset, 101);
495    }
496
497    #[test]
498    fn advance_tracks_objectid_from_last_item() {
499        // This is the bug that was fixed: the cursor must track the last
500        // item's objectid, not leave min_objectid at its original value.
501        let mut sk = zeroed_search_key();
502        sk.min_objectid = 100; // original search started at 100
503        let last = header(300, 132, 50); // batch ended at objectid 300
504        assert!(advance_cursor(&mut sk, &last));
505        assert_eq!(sk.min_objectid, 300, "must track last item's objectid");
506        assert_eq!(sk.min_type, 132);
507        assert_eq!(sk.min_offset, 51);
508    }
509
510    #[test]
511    fn advance_tracks_type_from_last_item() {
512        let mut sk = zeroed_search_key();
513        let last = header(256, 180, 42);
514        assert!(advance_cursor(&mut sk, &last));
515        assert_eq!(sk.min_objectid, 256);
516        assert_eq!(sk.min_type, 180);
517        assert_eq!(sk.min_offset, 43);
518    }
519
520    // --- advance_cursor: offset overflow ---
521
522    #[test]
523    fn advance_offset_overflow_bumps_type() {
524        let mut sk = zeroed_search_key();
525        let last = header(256, 132, u64::MAX);
526        assert!(advance_cursor(&mut sk, &last));
527        assert_eq!(sk.min_objectid, 256);
528        assert_eq!(sk.min_type, 133);
529        assert_eq!(sk.min_offset, 0);
530    }
531
532    // --- advance_cursor: type overflow ---
533
534    #[test]
535    fn advance_type_overflow_bumps_objectid() {
536        let mut sk = zeroed_search_key();
537        let last = header(256, u32::MAX, u64::MAX);
538        assert!(advance_cursor(&mut sk, &last));
539        assert_eq!(sk.min_objectid, 257);
540        assert_eq!(sk.min_type, 0);
541        assert_eq!(sk.min_offset, 0);
542    }
543
544    // --- advance_cursor: full keyspace exhaustion ---
545
546    #[test]
547    fn advance_all_overflow_returns_false() {
548        let mut sk = zeroed_search_key();
549        let last = header(u64::MAX, u32::MAX, u64::MAX);
550        assert!(!advance_cursor(&mut sk, &last));
551    }
552
553    // --- advance_cursor: edge cases ---
554
555    #[test]
556    fn advance_zero_key() {
557        let mut sk = zeroed_search_key();
558        let last = header(0, 0, 0);
559        assert!(advance_cursor(&mut sk, &last));
560        assert_eq!(sk.min_objectid, 0);
561        assert_eq!(sk.min_type, 0);
562        assert_eq!(sk.min_offset, 1);
563    }
564
565    #[test]
566    fn advance_objectid_max_type_zero_offset_max() {
567        // offset overflows, type bumps to 1
568        let mut sk = zeroed_search_key();
569        let last = header(u64::MAX, 0, u64::MAX);
570        assert!(advance_cursor(&mut sk, &last));
571        assert_eq!(sk.min_objectid, u64::MAX);
572        assert_eq!(sk.min_type, 1);
573        assert_eq!(sk.min_offset, 0);
574    }
575
576    #[test]
577    fn advance_preserves_unrelated_search_key_fields() {
578        let mut sk = zeroed_search_key();
579        sk.max_objectid = 999;
580        sk.max_type = 888;
581        sk.max_offset = 777;
582        sk.max_transid = 666;
583        let last = header(10, 20, 30);
584        advance_cursor(&mut sk, &last);
585        assert_eq!(sk.max_objectid, 999);
586        assert_eq!(sk.max_type, 888);
587        assert_eq!(sk.max_offset, 777);
588        assert_eq!(sk.max_transid, 666);
589    }
590}