// btrfs_uapi/tree_search.rs
//! # Generic B-tree search: walking any internal btrfs tree via `BTRFS_IOC_TREE_SEARCH`
//!
//! The kernel's tree search ioctl lets userspace read any internal btrfs tree
//! (chunk, root, quota, …) by specifying a key range.  Items are returned in
//! batches; [`tree_search`] advances the cursor automatically and calls a
//! closure once per item until the range is exhausted.
//!
//! # Byte order
//!
//! [`SearchHeader`] fields (objectid, offset, type) are in host byte order:
//! the kernel fills them in through the ioctl layer.  The `data` slice passed
//! to the callback contains the raw on-disk item payload, which is
//! **little-endian**; callers must use `u64::from_le_bytes` and friends when
//! interpreting it.
//!
//! # Ioctl version
//!
//! This module provides two variants:
//!
//! - [`tree_search`] uses `BTRFS_IOC_TREE_SEARCH` (v1) with its fixed
//!   3992-byte result buffer. Sufficient for all item types used by this crate.
//! - [`tree_search_v2`] uses `BTRFS_IOC_TREE_SEARCH_V2` with a caller-chosen
//!   buffer size. Useful when items may be larger than what v1 can return in a
//!   single batch.
26use crate::raw::{
27    btrfs_ioc_tree_search, btrfs_ioc_tree_search_v2, btrfs_ioctl_search_args,
28    btrfs_ioctl_search_args_v2, btrfs_ioctl_search_header,
29    btrfs_ioctl_search_key,
30};
31use std::{
32    mem,
33    os::{fd::AsRawFd, unix::io::BorrowedFd},
34};
35
/// Parameters specifying which items to return from a tree search.
///
/// The kernel searches a 136-bit key space ordered as
/// `(objectid << 72) | (type << 64) | offset`.
/// All items whose key falls in the inclusive range `[min_key, max_key]` are
/// returned.
///
/// Build a key for common cases with [`SearchKey::for_type`] or
/// [`SearchKey::for_objectid_range`].
///
/// Tree IDs and item type codes are the `BTRFS_*_OBJECTID` and `BTRFS_*_KEY`
/// constants from `crate::raw`, cast to `u64` and `u32` respectively at the
/// call site.
#[derive(Debug, Clone)]
pub struct SearchKey {
    /// Tree to search: use a `BTRFS_*_TREE_OBJECTID` constant from `crate::raw`.
    pub tree_id: u64,
    /// Lower bound of the objectid range to search (inclusive).
    pub min_objectid: u64,
    /// Upper bound of the objectid range to search (inclusive).
    pub max_objectid: u64,
    /// Lower bound of the item type range: use a `BTRFS_*_KEY` constant.
    pub min_type: u32,
    /// Upper bound of the item type range: use a `BTRFS_*_KEY` constant.
    pub max_type: u32,
    /// Lower bound of the offset range to search (inclusive).
    pub min_offset: u64,
    /// Upper bound of the offset range to search (inclusive).
    pub max_offset: u64,
    /// Lower bound on the transaction ID of the metadata block holding the
    /// item (not the transaction that created the item itself).
    pub min_transid: u64,
    /// Upper bound on the metadata block transaction ID (inclusive).
    pub max_transid: u64,
}

impl SearchKey {
    /// Return all items of `item_type` in `tree_id`, across every objectid
    /// and offset.
    #[must_use]
    pub fn for_type(tree_id: u64, item_type: u32) -> Self {
        // The unrestricted search is just the objectid-range search with the
        // widest possible bounds.
        Self::for_objectid_range(tree_id, item_type, 0, u64::MAX)
    }

    /// Return all items of `item_type` in `tree_id` whose objectid falls in
    /// `[min_objectid, max_objectid]`.
    #[must_use]
    pub fn for_objectid_range(
        tree_id: u64,
        item_type: u32,
        min_objectid: u64,
        max_objectid: u64,
    ) -> Self {
        Self {
            tree_id,
            min_objectid,
            max_objectid,
            // A single item type: pin both ends of the type range to it.
            min_type: item_type,
            max_type: item_type,
            // Offsets and transids are unrestricted.
            min_offset: 0,
            max_offset: u64::MAX,
            min_transid: 0,
            max_transid: u64::MAX,
        }
    }
}
106
/// Metadata returned for each item found by [`tree_search`].
///
/// Header fields arrive in host byte order — the kernel converts them on the
/// way through the ioctl layer.  The `data` slice handed to the callback
/// alongside this header, by contrast, is the raw on-disk item payload and is
/// in **little-endian** byte order.
#[derive(Debug, Clone, Copy)]
pub struct SearchHeader {
    /// Transaction ID of the metadata block that contains this item.
    pub transid: u64,
    /// Object ID portion of the item's btrfs key.
    pub objectid: u64,
    /// Offset portion of the item's btrfs key.
    pub offset: u64,
    /// Item type (the `type` field of the btrfs key).
    pub item_type: u32,
    /// Length in bytes of the item's data payload.
    pub len: u32,
}
125
/// Number of items to request per ioctl call.  `nr_items` is an in/out field:
/// it is reset to this value before every ioctl, and the kernel overwrites it
/// with the number of items actually placed in the result buffer.
const ITEMS_PER_BATCH: u32 = 4096;

/// Size of `btrfs_ioctl_search_header` as a compile-time constant.
/// Used when walking the packed (header, data) pairs in the result buffer.
const SEARCH_HEADER_SIZE: usize = mem::size_of::<btrfs_ioctl_search_header>();
131
132/// Walk every item in the tree that falls within `key`, calling `f` once for
133/// each one.
134///
135/// `f` receives:
136/// * a reference to the item's [`SearchHeader`] (objectid, offset, type, …)
137/// * a byte slice of the item's raw on-disk data payload (little-endian)
138///
139/// The search stops early and the error is returned if:
140/// * `f` returns `Err(_)`
141/// * the underlying `BTRFS_IOC_TREE_SEARCH` ioctl fails
142///
143/// Returns `Ok(())` when the entire requested range has been scanned.
144///
145/// # Errors
146///
147/// Returns `Err` if the ioctl fails or the callback returns an error.
148///
149/// # Privileges
150///
151/// Most trees require `CAP_SYS_ADMIN`.  The subvolume tree of the inode
152/// belonging to `fd` can be searched without elevated privileges by setting
153/// `key.tree_id = 0`.
154#[allow(clippy::needless_pass_by_value)] // SearchKey is small but not Copy by design
155pub fn tree_search(
156    fd: BorrowedFd,
157    key: SearchKey,
158    mut f: impl FnMut(&SearchHeader, &[u8]) -> nix::Result<()>,
159) -> nix::Result<()> {
160    let mut args: btrfs_ioctl_search_args = unsafe { mem::zeroed() };
161
162    fill_search_key(&mut args.key, &key);
163
164    loop {
165        args.key.nr_items = ITEMS_PER_BATCH;
166
167        unsafe { btrfs_ioc_tree_search(fd.as_raw_fd(), &raw mut args) }?;
168
169        let nr = args.key.nr_items;
170        if nr == 0 {
171            break;
172        }
173
174        // Walk the result buffer.  We use raw pointer reads to avoid borrow
175        // checker conflicts: args.buf (read-only after the ioctl) and
176        // args.key (mutated below for cursor advancement) are separate fields,
177        // but a Rust reference to one would prevent mutation of the other.
178        //
179        // SAFETY:
180        // * The ioctl has returned successfully, so args.buf contains nr valid
181        //   (header, data) pairs totalling at most args.buf.len() bytes.
182        // * buf_base is derived from args.buf which lives for the entire loop
183        //   body; it is not invalidated until args is dropped.
184        // * All raw reads are bounds-checked before dereferencing.
185        // * The `data` slices passed to `f` do not outlive this loop
186        //   iteration — `f` takes `&[u8]`, not `&'static [u8]`.
187        let buf_base: *const u8 = args.buf.as_ptr().cast();
188        let buf_cap = args.buf.len();
189
190        let mut off = 0usize;
191        let mut last = SearchHeader {
192            transid: 0,
193            objectid: 0,
194            offset: 0,
195            item_type: 0,
196            len: 0,
197        };
198
199        for _ in 0..nr {
200            if off + SEARCH_HEADER_SIZE > buf_cap {
201                return Err(nix::errno::Errno::EOVERFLOW);
202            }
203            let raw_hdr: btrfs_ioctl_search_header = unsafe {
204                buf_base
205                    .add(off)
206                    .cast::<btrfs_ioctl_search_header>()
207                    .read_unaligned()
208            };
209            let hdr = SearchHeader {
210                transid: raw_hdr.transid,
211                objectid: raw_hdr.objectid,
212                offset: raw_hdr.offset,
213                item_type: raw_hdr.type_,
214                len: raw_hdr.len,
215            };
216            off += SEARCH_HEADER_SIZE;
217
218            let data_len = hdr.len as usize;
219            if off + data_len > buf_cap {
220                return Err(nix::errno::Errno::EOVERFLOW);
221            }
222            let data: &[u8] = unsafe {
223                std::slice::from_raw_parts(buf_base.add(off), data_len)
224            };
225            off += data_len;
226
227            f(&hdr, data)?;
228            last = hdr;
229        }
230
231        if !advance_cursor(&mut args.key, &last) {
232            break;
233        }
234    }
235
236    Ok(())
237}
238
/// Default buffer size for [`tree_search_v2`]: 64 KiB.
/// Used only when the caller passes `buf_size = None`.
const DEFAULT_V2_BUF_SIZE: usize = 64 * 1024;
241
242/// Like [`tree_search`] but uses `BTRFS_IOC_TREE_SEARCH_V2` with a larger
243/// result buffer.
244///
245/// `buf_size` controls the buffer size in bytes (default 64 KiB if `None`).
246/// The v2 ioctl is otherwise identical to v1 but can return more data per
247/// batch, reducing the number of round-trips for large result sets.
248///
249/// If `buf_size` is too small for even a single item, the kernel returns
250/// `EOVERFLOW` and sets `buf_size` to the required minimum. This function
251/// automatically retries with the larger buffer.
252///
253/// # Errors
254///
255/// Returns `Err` if the ioctl fails or the callback returns an error.
256#[allow(clippy::needless_pass_by_value)] // SearchKey is small but not Copy by design
257#[allow(clippy::too_many_lines)]
258pub fn tree_search_v2(
259    fd: BorrowedFd,
260    key: SearchKey,
261    buf_size: Option<usize>,
262    mut f: impl FnMut(&SearchHeader, &[u8]) -> nix::Result<()>,
263) -> nix::Result<()> {
264    let base_size = mem::size_of::<btrfs_ioctl_search_args_v2>();
265    let mut capacity = buf_size.unwrap_or(DEFAULT_V2_BUF_SIZE);
266
267    // Allocate as Vec<u64> for 8-byte alignment.
268    let alloc_bytes = base_size + capacity;
269    let num_u64s = alloc_bytes.div_ceil(mem::size_of::<u64>());
270    let mut buf = vec![0u64; num_u64s];
271
272    // SAFETY: buf is correctly sized and aligned for btrfs_ioctl_search_args_v2.
273    let args_ptr = buf.as_mut_ptr().cast::<btrfs_ioctl_search_args_v2>();
274    unsafe {
275        fill_search_key(&mut (*args_ptr).key, &key);
276    }
277
278    loop {
279        unsafe {
280            (*args_ptr).key.nr_items = ITEMS_PER_BATCH;
281            (*args_ptr).buf_size = capacity as u64;
282        }
283
284        match unsafe {
285            btrfs_ioc_tree_search_v2(fd.as_raw_fd(), &raw mut *args_ptr)
286        } {
287            Ok(_) => {}
288            Err(nix::errno::Errno::EOVERFLOW) => {
289                // Kernel tells us the needed size via buf_size.
290                #[allow(clippy::cast_possible_truncation)]
291                // buf_size fits in usize
292                let needed = unsafe { (*args_ptr).buf_size } as usize;
293                if needed <= capacity {
294                    return Err(nix::errno::Errno::EOVERFLOW);
295                }
296                capacity = needed;
297                let alloc_bytes = base_size + capacity;
298                let num_u64s = alloc_bytes.div_ceil(mem::size_of::<u64>());
299                buf.resize(num_u64s, 0);
300                // args_ptr must be refreshed after reallocation.
301                let args_ptr_new =
302                    buf.as_mut_ptr().cast::<btrfs_ioctl_search_args_v2>();
303                unsafe {
304                    (*args_ptr_new).key.nr_items = ITEMS_PER_BATCH;
305                    (*args_ptr_new).buf_size = capacity as u64;
306                    btrfs_ioc_tree_search_v2(
307                        fd.as_raw_fd(),
308                        &raw mut *args_ptr_new,
309                    )?;
310                }
311                // Fall through to process results with the new pointer.
312                // Update our local for the rest of the loop.
313                let _ = args_ptr_new;
314            }
315            Err(e) => return Err(e),
316        }
317
318        // Re-derive pointer after potential reallocation.
319        let args_ptr = buf.as_mut_ptr().cast::<btrfs_ioctl_search_args_v2>();
320
321        let nr = unsafe { (*args_ptr).key.nr_items };
322        if nr == 0 {
323            break;
324        }
325
326        // The result data starts right after the base struct (at the
327        // flexible array member `buf[]`).
328        let data_base: *const u8 =
329            unsafe { (args_ptr as *const u8).add(base_size) };
330
331        let mut off = 0usize;
332        let mut last = SearchHeader {
333            transid: 0,
334            objectid: 0,
335            offset: 0,
336            item_type: 0,
337            len: 0,
338        };
339
340        for _ in 0..nr {
341            if off + SEARCH_HEADER_SIZE > capacity {
342                return Err(nix::errno::Errno::EOVERFLOW);
343            }
344            let raw_hdr: btrfs_ioctl_search_header = unsafe {
345                data_base
346                    .add(off)
347                    .cast::<btrfs_ioctl_search_header>()
348                    .read_unaligned()
349            };
350            let hdr = SearchHeader {
351                transid: raw_hdr.transid,
352                objectid: raw_hdr.objectid,
353                offset: raw_hdr.offset,
354                item_type: raw_hdr.type_,
355                len: raw_hdr.len,
356            };
357            off += SEARCH_HEADER_SIZE;
358
359            let data_len = hdr.len as usize;
360            if off + data_len > capacity {
361                return Err(nix::errno::Errno::EOVERFLOW);
362            }
363            let data: &[u8] = unsafe {
364                std::slice::from_raw_parts(data_base.add(off), data_len)
365            };
366            off += data_len;
367
368            f(&hdr, data)?;
369            last = hdr;
370        }
371
372        if !advance_cursor(unsafe { &mut (*args_ptr).key }, &last) {
373            break;
374        }
375    }
376
377    Ok(())
378}
379
380fn fill_search_key(sk: &mut btrfs_ioctl_search_key, key: &SearchKey) {
381    sk.tree_id = key.tree_id;
382    sk.min_objectid = key.min_objectid;
383    sk.max_objectid = key.max_objectid;
384    sk.min_type = key.min_type;
385    sk.max_type = key.max_type;
386    sk.min_offset = key.min_offset;
387    sk.max_offset = key.max_offset;
388    sk.min_transid = key.min_transid;
389    sk.max_transid = key.max_transid;
390}
391
392/// Advance the search cursor past `last` so the next batch begins from the
393/// item immediately after it in the 136-bit key space
394/// `(objectid << 72) | (type << 64) | offset`.
395///
396/// The kernel interprets `(min_objectid, min_type, min_offset)` as a compound
397/// tuple key, so all three fields must be updated together to point past the
398/// last returned item.  Advancing only `min_offset` while leaving
399/// `min_objectid` at its original value would cause items from lower objectids
400/// that were already returned to satisfy the new minimum and be yielded again.
401///
402/// Returns `false` when the objectid also overflows, meaning the full key
403/// space has been exhausted.
404fn advance_cursor(
405    sk: &mut btrfs_ioctl_search_key,
406    last: &SearchHeader,
407) -> bool {
408    let (new_offset, offset_overflow) = last.offset.overflowing_add(1);
409    if !offset_overflow {
410        sk.min_objectid = last.objectid;
411        sk.min_type = last.item_type;
412        sk.min_offset = new_offset;
413        return true;
414    }
415
416    sk.min_offset = 0;
417    let (new_type, type_overflow) = last.item_type.overflowing_add(1);
418    if !type_overflow {
419        sk.min_objectid = last.objectid;
420        sk.min_type = new_type;
421        return true;
422    }
423
424    sk.min_type = 0;
425    let (new_oid, oid_overflow) = last.objectid.overflowing_add(1);
426    if oid_overflow {
427        return false;
428    }
429    sk.min_objectid = new_oid;
430    true
431}
432
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `SearchHeader` carrying only the key fields; `transid` and
    /// `len` are irrelevant for cursor-advancement tests.
    fn header(objectid: u64, item_type: u32, offset: u64) -> SearchHeader {
        SearchHeader {
            transid: 0,
            objectid,
            offset,
            item_type,
            len: 0,
        }
    }

    /// An all-zero kernel search key for tests to mutate.
    fn zeroed_search_key() -> btrfs_ioctl_search_key {
        // SAFETY: btrfs_ioctl_search_key is a plain-old-data ioctl struct;
        // the all-zero bit pattern is a valid value.
        unsafe { mem::zeroed() }
    }

    // --- SearchKey constructors ---

    #[test]
    fn for_type_covers_all_objectids_and_offsets() {
        let sk = SearchKey::for_type(5, 132);
        assert_eq!(sk.tree_id, 5);
        assert_eq!(sk.min_objectid, 0);
        assert_eq!(sk.max_objectid, u64::MAX);
        assert_eq!(sk.min_type, 132);
        assert_eq!(sk.max_type, 132);
        assert_eq!(sk.min_offset, 0);
        assert_eq!(sk.max_offset, u64::MAX);
        assert_eq!(sk.min_transid, 0);
        assert_eq!(sk.max_transid, u64::MAX);
    }

    #[test]
    fn for_objectid_range_restricts_objectids() {
        let sk = SearchKey::for_objectid_range(1, 84, 100, 200);
        assert_eq!(sk.tree_id, 1);
        assert_eq!(sk.min_objectid, 100);
        assert_eq!(sk.max_objectid, 200);
        assert_eq!(sk.min_type, 84);
        assert_eq!(sk.max_type, 84);
        assert_eq!(sk.min_offset, 0);
        assert_eq!(sk.max_offset, u64::MAX);
    }

    // --- fill_search_key ---

    #[test]
    fn fill_search_key_copies_all_fields() {
        // Distinct values per field so a swapped assignment would be caught.
        let key = SearchKey {
            tree_id: 1,
            min_objectid: 10,
            max_objectid: 20,
            min_type: 30,
            max_type: 40,
            min_offset: 50,
            max_offset: 60,
            min_transid: 70,
            max_transid: 80,
        };
        let mut sk = zeroed_search_key();
        fill_search_key(&mut sk, &key);
        assert_eq!(sk.tree_id, 1);
        assert_eq!(sk.min_objectid, 10);
        assert_eq!(sk.max_objectid, 20);
        assert_eq!(sk.min_type, 30);
        assert_eq!(sk.max_type, 40);
        assert_eq!(sk.min_offset, 50);
        assert_eq!(sk.max_offset, 60);
        assert_eq!(sk.min_transid, 70);
        assert_eq!(sk.max_transid, 80);
    }

    // --- advance_cursor: normal case ---

    #[test]
    fn advance_increments_offset() {
        let mut sk = zeroed_search_key();
        let last = header(256, 132, 100);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 256);
        assert_eq!(sk.min_type, 132);
        assert_eq!(sk.min_offset, 101);
    }

    #[test]
    fn advance_tracks_objectid_from_last_item() {
        // This is the bug that was fixed: the cursor must track the last
        // item's objectid, not leave min_objectid at its original value.
        let mut sk = zeroed_search_key();
        sk.min_objectid = 100; // original search started at 100
        let last = header(300, 132, 50); // batch ended at objectid 300
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 300, "must track last item's objectid");
        assert_eq!(sk.min_type, 132);
        assert_eq!(sk.min_offset, 51);
    }

    #[test]
    fn advance_tracks_type_from_last_item() {
        let mut sk = zeroed_search_key();
        let last = header(256, 180, 42);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 256);
        assert_eq!(sk.min_type, 180);
        assert_eq!(sk.min_offset, 43);
    }

    // --- advance_cursor: offset overflow ---

    #[test]
    fn advance_offset_overflow_bumps_type() {
        let mut sk = zeroed_search_key();
        let last = header(256, 132, u64::MAX);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 256);
        assert_eq!(sk.min_type, 133);
        assert_eq!(sk.min_offset, 0);
    }

    // --- advance_cursor: type overflow ---

    #[test]
    fn advance_type_overflow_bumps_objectid() {
        let mut sk = zeroed_search_key();
        let last = header(256, u32::MAX, u64::MAX);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 257);
        assert_eq!(sk.min_type, 0);
        assert_eq!(sk.min_offset, 0);
    }

    // --- advance_cursor: full keyspace exhaustion ---

    #[test]
    fn advance_all_overflow_returns_false() {
        let mut sk = zeroed_search_key();
        let last = header(u64::MAX, u32::MAX, u64::MAX);
        assert!(!advance_cursor(&mut sk, &last));
    }

    // --- advance_cursor: edge cases ---

    #[test]
    fn advance_zero_key() {
        let mut sk = zeroed_search_key();
        let last = header(0, 0, 0);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, 0);
        assert_eq!(sk.min_type, 0);
        assert_eq!(sk.min_offset, 1);
    }

    #[test]
    fn advance_objectid_max_type_zero_offset_max() {
        // offset overflows, type bumps to 1
        let mut sk = zeroed_search_key();
        let last = header(u64::MAX, 0, u64::MAX);
        assert!(advance_cursor(&mut sk, &last));
        assert_eq!(sk.min_objectid, u64::MAX);
        assert_eq!(sk.min_type, 1);
        assert_eq!(sk.min_offset, 0);
    }

    #[test]
    fn advance_preserves_unrelated_search_key_fields() {
        // advance_cursor must only touch the min_* cursor components.
        let mut sk = zeroed_search_key();
        sk.max_objectid = 999;
        sk.max_type = 888;
        sk.max_offset = 777;
        sk.max_transid = 666;
        let last = header(10, 20, 30);
        advance_cursor(&mut sk, &last);
        assert_eq!(sk.max_objectid, 999);
        assert_eq!(sk.max_type, 888);
        assert_eq!(sk.max_offset, 777);
        assert_eq!(sk.max_transid, 666);
    }
}