btrfs-transaction 0.13.0

//! # Transaction lifecycle: start, commit, abort
//!
//! A [`Transaction`] groups multiple tree modifications into a
//! single atomic commit. The commit point is the superblock write:
//! all new tree blocks are written first (at new locations via
//! COW), then the superblock is updated to point to the new root.
//!
//! ## Lifecycle
//!
//! - [`Transaction::start`]: bumps the in-memory generation,
//!   snapshots the current root pointers, and seeds the metadata
//!   block-group cursor.
//! - [`Transaction::commit`]: force-COWs the root tree (so every
//!   commit advances `header.generation`), runs a convergence loop
//!   draining delayed refs, root-item updates, and FST updates
//!   until stable, flushes every dirty block to disk via the chunk
//!   tree (writing all DUP / RAID1 / RAID0 / RAID10 / RAID5 /
//!   RAID6 stripes per the chunk cache's `plan_write`), updates
//!   superblock fields and the rotating backup roots, and writes
//!   the superblock to all mirrors.
//! - [`Transaction::abort`]: restores the in-memory root-pointer
//!   snapshot so the next transaction reads consistent on-disk
//!   state.
//!
//! ## High-level helpers
//!
//! Most callers don't drive `search_slot` + `insert_item` directly
//! — this module also provides higher-level builders that compose
//! several primitive operations:
//!
//! - **Block / extent allocation**:
//!   [`Transaction::alloc_block`],
//!   [`Transaction::alloc_tree_block`],
//!   [`Transaction::alloc_data_extent`],
//!   [`Transaction::reserve_data_extent`].
//! - **File data**: [`Transaction::insert_file_extent`],
//!   [`Transaction::insert_inline_extent`],
//!   [`Transaction::insert_csums`],
//!   [`Transaction::write_file_data`],
//!   [`Transaction::update_inode_nbytes`].
//! - **Inodes / directory entries**:
//!   [`Transaction::create_inode`] (with
//!   [`crate::inode::InodeArgs`]),
//!   [`Transaction::link_dir_entry`],
//!   [`Transaction::link_subvol_entry`],
//!   [`Transaction::set_xattr`],
//!   [`Transaction::set_inode_nlink`].
//! - **Subvolume / root-tree management**:
//!   [`Transaction::create_empty_tree`],
//!   [`Transaction::insert_root_ref`],
//!   [`Transaction::set_root_readonly`],
//!   [`Transaction::set_default_subvol`],
//!   [`Transaction::set_device_total_bytes`].
//! - **Recovery**: [`Transaction::rebuild_chunk_tree`] (used by
//!   `rescue chunk-recover --apply`).
//! - **Free-form compression** (free functions, also exported):
//!   [`try_compress`] (inline), [`try_compress_regular`]
//!   (per-sector LZO framing).

use crate::{
    allocation,
    buffer::{ExtentBuffer, HEADER_SIZE, ITEM_SIZE},
    cow::cow_block,
    delayed_ref::{DelayedRefKey, DelayedRefQueue},
    filesystem::Filesystem,
    free_space::{BlockGroupRangeDeltas, Range},
    items,
    path::BtrfsPath,
    search::{self, SearchIntent},
};
use btrfs_disk::{
    chunk::{
        chunk_item_bytes, parse_chunk_item, sys_chunk_array_append,
        sys_chunk_array_contains,
    },
    items::{BlockGroupFlags, ExtentItem, RootItem},
    tree::{DiskKey, KeyType},
};
use std::{
    collections::{BTreeMap, BTreeSet},
    io::{self, Read, Seek, Write},
};

/// Block group kind that the transaction allocator can target.
///
/// Metadata block groups hold all tree blocks except the chunk tree;
/// SYSTEM block groups hold the chunk tree itself, so its blocks can be
/// resolved by the early-mount bootstrap via the superblock's
/// `sys_chunk_array`. DATA block groups hold file data extents, which
/// are written directly (not COW'd through the tree-block pipeline).
///
/// The ordering derives are required because the kind is used as the
/// key of the per-kind allocator map in [`Transaction`].
#[derive(Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Debug)]
pub enum BlockGroupKind {
    /// A metadata block group (tree blocks for trees other than the
    /// chunk tree).
    Metadata,
    /// A SYSTEM block group (used exclusively for chunk tree blocks).
    System,
    /// A DATA block group (used for file data extents).
    Data,
}

/// Bump allocator state for one [`BlockGroupKind`].
#[derive(Debug, Clone, Copy)]
struct AllocCursor {
    /// Next logical address to hand out; advanced by the allocation
    /// size after every successful allocation.
    cursor: u64,
    /// Exclusive upper bound of the current free region: an allocation
    /// of `size` bytes fits while `cursor + size <= end`.
    end: u64,
}

/// Handle for an in-progress transaction.
///
/// Created by [`Transaction::start`], which increments the generation.
/// Tracks dirty blocks and pending reference count changes. Finalized by
/// either [`commit`](Transaction::commit) (write to disk) or
/// [`abort`](Transaction::abort) (discard).
pub struct Transaction<R> {
    /// The transaction generation (superblock.generation + 1).
    pub transid: u64,
    /// Blocks freed during this transaction (old COW sources).
    freed_blocks: Vec<u64>,
    /// Blocks allocated during this transaction (for free space tree updates).
    allocated_blocks: Vec<u64>,
    /// Delayed reference count updates.
    pub delayed_refs: DelayedRefQueue,
    /// Per-block-group byte ranges allocated and freed during this
    /// transaction. Populated by `flush_delayed_refs`. Consumed by the
    /// free space tree update step (Stage F3). Cleared at commit end.
    pub bg_range_deltas: BlockGroupRangeDeltas,
    /// Per-kind bump allocator state. Lazily populated on first use of
    /// each kind so that filesystems without the relevant block group
    /// type pay no scanning cost up front.
    alloc: BTreeMap<BlockGroupKind, AllocCursor>,
    /// Logical addresses of blocks freed during this transaction. These
    /// must not be reallocated before the superblock is committed, because
    /// the previous superblock still references them. A crash before commit
    /// would leave both old and new data at the same address.
    pinned: BTreeSet<u64>,
    /// Phantom to tie the lifetime/type parameter.
    _phantom: std::marker::PhantomData<R>,
}

impl<R: Read + Write + Seek> Transaction<R> {
    /// Start a new transaction.
    ///
    /// Computes the transaction id as `superblock.generation + 1`,
    /// stores it in `fs_info.generation`, snapshots the current root
    /// pointers, and seeds the metadata allocation cursor by asking
    /// `find_alloc_region_after` for a nodesize-sized free region in a
    /// metadata block group.
    ///
    /// # Errors
    ///
    /// Returns an error if the superblock generation cannot be advanced
    /// without overflowing `u64`, or if no metadata allocation region
    /// can be found.
    pub fn start(fs_info: &mut Filesystem<R>) -> io::Result<Self> {
        // The generation comes straight from the on-disk superblock, so
        // a value of u64::MAX (corrupted image) must surface as an error
        // instead of an arithmetic-overflow panic or a wrap-around to 0.
        let on_disk_generation = fs_info.superblock.generation;
        let transid = on_disk_generation.checked_add(1).ok_or_else(|| {
            io::Error::other(format!(
                "start: superblock generation {on_disk_generation} cannot \
                 be advanced without overflowing",
            ))
        })?;
        fs_info.generation = transid;

        // Snapshot current roots so we can detect changes at commit time
        fs_info.snapshot_roots();

        // Eagerly seed the metadata cursor — every transaction COWs at
        // least one metadata block, so failing here is the same as
        // failing on the first alloc. The SYSTEM cursor is created on
        // demand the first time the chunk tree is COWed.
        let nodesize = u64::from(fs_info.nodesize);
        let (cursor, end) = find_alloc_region_after(
            fs_info,
            BlockGroupKind::Metadata,
            0,
            nodesize,
            nodesize,
        )?;
        let mut alloc = BTreeMap::new();
        alloc.insert(BlockGroupKind::Metadata, AllocCursor { cursor, end });

        Ok(Self {
            transid,
            freed_blocks: Vec::new(),
            allocated_blocks: Vec::new(),
            delayed_refs: DelayedRefQueue::new(),
            bg_range_deltas: BlockGroupRangeDeltas::new(),
            alloc,
            pinned: BTreeSet::new(),
            _phantom: std::marker::PhantomData,
        })
    }

    /// Hand out one nodesize-sized block address from a block group of
    /// `kind`.
    ///
    /// Each kind has its own bump cursor. The cursor is created on the
    /// first request for that kind, and whenever the current region
    /// cannot fit another block a fresh region is requested from
    /// `find_alloc_region_after`, starting at the old cursor position.
    /// Addresses present in the pinned set are stepped over. The
    /// returned address is recorded in `allocated_blocks`.
    ///
    /// # Errors
    ///
    /// Returns an error if the region search fails, or if the region it
    /// returns is still too small for one block.
    pub fn alloc_block(
        &mut self,
        fs_info: &mut Filesystem<R>,
        kind: BlockGroupKind,
    ) -> io::Result<u64> {
        use std::collections::btree_map::Entry;

        let block_len = u64::from(fs_info.nodesize);

        // First request for this kind: locate an initial free region.
        if let Entry::Vacant(vacant) = self.alloc.entry(kind) {
            let (cursor, end) = find_alloc_region_after(
                fs_info, kind, 0, block_len, block_len,
            )?;
            vacant.insert(AllocCursor { cursor, end });
        }

        loop {
            // Work on a copy of the cursor; the map is written back
            // below once the next position is known.
            let current = self.alloc[&kind];
            let (mut next, mut limit) = (current.cursor, current.end);

            if next + block_len > limit {
                // The region is used up: move on to another free extent.
                (next, limit) = find_alloc_region_after(
                    fs_info, kind, next, block_len, block_len,
                )?;
                if next + block_len > limit {
                    return Err(io::Error::other(format!(
                        "no {kind:?} block group with enough free space",
                    )));
                }
            }

            let logical = next;
            self.alloc.insert(
                kind,
                AllocCursor {
                    cursor: next + block_len,
                    end: limit,
                },
            );

            // Blocks freed in this transaction are still referenced by
            // the previous superblock, so they must not be reused
            // before commit. Step past them and try the next slot.
            if self.pinned.contains(&logical) {
                continue;
            }

            // Defensive checks (debug builds only).
            debug_assert!(
                !self.pinned.contains(&logical),
                "alloc_block: allocated pinned address {logical:#x}",
            );
            debug_assert_eq!(
                logical % block_len,
                0,
                "alloc_block: address {logical:#x} not aligned to nodesize {}",
                fs_info.nodesize,
            );

            self.allocated_blocks.push(logical);
            return Ok(logical);
        }
    }

    /// Allocate a tree block for `tree_id` and queue its delayed ref.
    ///
    /// Blocks of the chunk tree (`BTRFS_CHUNK_TREE_OBJECTID`) are taken
    /// from a SYSTEM block group; every other tree allocates from a
    /// metadata block group. After the delayed ref is queued, a SYSTEM
    /// allocation is additionally passed to
    /// `ensure_in_sys_chunk_array`, so the superblock's bootstrap chunk
    /// array covers the new block.
    ///
    /// # Errors
    ///
    /// Returns an error if the block allocation fails, or if a SYSTEM
    /// allocation cannot be registered in the `sys_chunk_array`.
    pub fn alloc_tree_block(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        level: u8,
    ) -> io::Result<u64> {
        let for_chunk_tree =
            tree_id == u64::from(btrfs_disk::raw::BTRFS_CHUNK_TREE_OBJECTID);
        let kind = match for_chunk_tree {
            true => BlockGroupKind::System,
            false => BlockGroupKind::Metadata,
        };

        let logical = self.alloc_block(fs_info, kind)?;
        self.delayed_refs.add_ref(logical, true, tree_id, level);

        if for_chunk_tree {
            self.ensure_in_sys_chunk_array(fs_info, logical)?;
        }
        Ok(logical)
    }

    /// Allocate a data extent: find space in a DATA block group, write
    /// `data` to disk immediately, queue a `+1` `EXTENT_DATA_REF` delayed
    /// ref, and return the allocated logical address.
    ///
    /// `data` is zero-padded up to the next sectorsize boundary before
    /// being written. The returned address is sectorsize-aligned and the
    /// queued ref's `num_bytes` is the padded size.
    ///
    /// Unlike tree-block allocations, data extents are written to disk
    /// at allocation time (`BlockReader::write_block` routes to all
    /// stripe copies). Only the metadata (`EXTENT_ITEM`, `EXTENT_DATA`,
    /// `EXTENT_CSUM`) goes through the commit pipeline.
    ///
    /// # Errors
    ///
    /// Returns an error if no DATA block group has enough free space,
    /// or if the disk write fails.
    pub fn alloc_data_extent(
        &mut self,
        fs_info: &mut Filesystem<R>,
        data: &[u8],
        owner_root: u64,
        owner_ino: u64,
        owner_offset: u64,
    ) -> io::Result<u64> {
        let sectorsize = u64::from(fs_info.sectorsize);
        let raw_len = data.len() as u64;
        let aligned_size = align_up(raw_len, sectorsize);
        if aligned_size == 0 {
            return Err(io::Error::other(
                "alloc_data_extent: empty data not supported",
            ));
        }

        let logical = self.reserve_data_extent(
            fs_info,
            aligned_size,
            owner_root,
            owner_ino,
            owner_offset,
        )?;

        // Write the data to disk now (zero-padded to sector alignment).
        // BlockReader::write_block fans out to all stripe copies.
        if raw_len == aligned_size {
            fs_info.reader_mut().write_block(logical, data)?;
        } else {
            let mut padded = Vec::with_capacity(aligned_size as usize);
            padded.extend_from_slice(data);
            padded.resize(aligned_size as usize, 0);
            fs_info.reader_mut().write_block(logical, &padded)?;
        }

        Ok(logical)
    }

    /// Reserve `aligned_size` bytes in a DATA block group without
    /// writing anything, and return the reserved logical address.
    ///
    /// A `+1` delayed data ref for
    /// `(owner_root, owner_ino, owner_offset)` is queued for the
    /// reserved range, and the address is recorded in
    /// `allocated_blocks`. Putting real content at the returned address
    /// before commit is the caller's job (for example through
    /// [`BlockReader::write_block`](btrfs_disk::reader::BlockReader::write_block)).
    ///
    /// `aligned_size` is used as given: it is never rounded or padded
    /// here, so it must be non-zero and a multiple of `sectorsize`.
    ///
    /// # Errors
    ///
    /// Returns an error if `aligned_size` is zero or not a multiple of
    /// `sectorsize`, if the region search fails, or if the region it
    /// returns is too small for the request.
    pub fn reserve_data_extent(
        &mut self,
        fs_info: &mut Filesystem<R>,
        aligned_size: u64,
        owner_root: u64,
        owner_ino: u64,
        owner_offset: u64,
    ) -> io::Result<u64> {
        use std::collections::btree_map::Entry;

        let sectorsize = u64::from(fs_info.sectorsize);
        if aligned_size == 0 {
            return Err(io::Error::other(
                "reserve_data_extent: aligned_size must be > 0",
            ));
        }
        if !aligned_size.is_multiple_of(sectorsize) {
            return Err(io::Error::other(format!(
                "reserve_data_extent: aligned_size {aligned_size} not a \
                 multiple of sectorsize {sectorsize}"
            )));
        }

        // Per-kind bump cursor, created on the first DATA request.
        let kind = BlockGroupKind::Data;
        if let Entry::Vacant(vacant) = self.alloc.entry(kind) {
            let (cursor, end) = find_alloc_region_after(
                fs_info,
                kind,
                0,
                sectorsize,
                aligned_size,
            )?;
            vacant.insert(AllocCursor { cursor, end });
        }

        // Stay in the current region if the request fits; otherwise
        // look for another region past the current cursor.
        let current = self.alloc[&kind];
        let (logical, limit) = if current.cursor + aligned_size > current.end
        {
            let (cursor, end) = find_alloc_region_after(
                fs_info,
                kind,
                current.cursor,
                sectorsize,
                aligned_size,
            )?;
            if cursor + aligned_size > end {
                return Err(io::Error::other(
                    "no DATA block group with enough free space",
                ));
            }
            (cursor, end)
        } else {
            (current.cursor, current.end)
        };

        self.alloc.insert(
            kind,
            AllocCursor {
                cursor: logical + aligned_size,
                end: limit,
            },
        );

        debug_assert_eq!(
            logical % sectorsize,
            0,
            "reserve_data_extent: address {logical:#x} not aligned to \
             sectorsize {sectorsize}",
        );

        self.delayed_refs.add_data_ref(
            logical,
            aligned_size,
            owner_root,
            owner_ino,
            owner_offset,
            1,
        );
        self.allocated_blocks.push(logical);

        Ok(logical)
    }

    /// Insert a pre-serialized `EXTENT_DATA` item into tree `tree_id`.
    ///
    /// The item is keyed `(inode, EXTENT_DATA, file_offset)` and its
    /// payload is `extent_data` verbatim (typically produced by
    /// [`FileExtentItem::to_bytes_regular`](btrfs_disk::items::FileExtentItem::to_bytes_regular)
    /// or
    /// [`FileExtentItem::to_bytes_inline`](btrfs_disk::items::FileExtentItem::to_bytes_inline)).
    /// Inline extents use `file_offset = 0`.
    ///
    /// This is insert-only: an existing item at the same key is never
    /// replaced. Callers that overwrite a range must drop the old
    /// extent first.
    ///
    /// # Errors
    ///
    /// Returns an error if an item already exists at the key, if the
    /// search leaves no leaf in the path, or if a tree operation fails.
    pub fn insert_file_extent(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        inode: u64,
        file_offset: u64,
        extent_data: &[u8],
    ) -> io::Result<()> {
        let key = DiskKey {
            objectid: inode,
            key_type: KeyType::ExtentData,
            offset: file_offset,
        };
        // Space the search must make available: item header + payload.
        let space_needed = (ITEM_SIZE + extent_data.len()) as u32;

        let mut path = BtrfsPath::new();
        let already_present = search::search_slot(
            Some(&mut *self),
            fs_info,
            tree_id,
            &key,
            &mut path,
            SearchIntent::Insert(space_needed),
            true,
        )?;
        if already_present {
            path.release();
            return Err(io::Error::other(format!(
                "insert_file_extent: EXTENT_DATA already exists at \
                 (ino={inode}, offset={file_offset}) in tree {tree_id}"
            )));
        }

        let Some(leaf) = path.nodes[0].as_mut() else {
            return Err(io::Error::other(
                "insert_file_extent: no leaf in path",
            ));
        };
        let slot = path.slots[0];
        items::insert_item(leaf, slot, &key, extent_data)?;
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Checksum `on_disk_data` sector by sector and store the results
    /// in the csum tree (tree id 7).
    ///
    /// `on_disk_data` must be the bytes exactly as they are stored on
    /// disk (for a compressed extent: the compressed, framed payload),
    /// and its length must be a non-zero multiple of `sectorsize`.
    ///
    /// Every sector yields one 4-byte little-endian CRC32C value
    /// (computed with `crc32c::crc32c`). The values are stored in
    /// `EXTENT_CSUM` items keyed `(EXTENT_CSUM_OBJECTID, EXTENT_CSUM,
    /// logical address of the first sector covered)`. The number of
    /// checksums per item is capped relative to the leaf size, so a
    /// large extent produces several consecutive items.
    ///
    /// Existing neighbouring csum items are not merged with: each call
    /// inserts fresh items only.
    ///
    /// # Errors
    ///
    /// Returns an error if the filesystem's `csum_type` is not CRC32C,
    /// if `on_disk_data` is empty or not sectorsize-aligned, if a csum
    /// item already exists at one of the target keys, or if any tree
    /// operation fails.
    pub fn insert_csums(
        &mut self,
        fs_info: &mut Filesystem<R>,
        logical_bytenr: u64,
        on_disk_data: &[u8],
    ) -> io::Result<()> {
        use btrfs_disk::superblock::ChecksumType;

        /// Stored size of one CRC32C value.
        const CSUM_LEN: usize = 4;

        if fs_info.superblock.csum_type != ChecksumType::Crc32 {
            return Err(io::Error::other(format!(
                "insert_csums: only CRC32C is supported (csum_type = {:?})",
                fs_info.superblock.csum_type,
            )));
        }

        let sectorsize = u64::from(fs_info.sectorsize);
        let total = on_disk_data.len() as u64;
        if total == 0 || !total.is_multiple_of(sectorsize) {
            return Err(io::Error::other(format!(
                "insert_csums: on_disk_data length {total} not a multiple of \
                 sectorsize {sectorsize}",
            )));
        }

        // All checksums, concatenated in sector order.
        let sector_count = (total / sectorsize) as usize;
        let mut csum_bytes: Vec<u8> =
            Vec::with_capacity(sector_count * CSUM_LEN);
        csum_bytes.extend(
            on_disk_data
                .chunks_exact(sectorsize as usize)
                .flat_map(|sector| crc32c::crc32c(sector).to_le_bytes()),
        );

        // Per-item cap: leave room in the leaf for a second item header
        // (future split) plus one spare checksum slot.
        let leaf_capacity = (fs_info.nodesize as usize) - HEADER_SIZE;
        let payload_budget =
            leaf_capacity.saturating_sub(2 * ITEM_SIZE) - CSUM_LEN;
        let csums_per_item = (payload_budget / CSUM_LEN).max(1);

        let csum_objectid =
            i64::from(btrfs_disk::raw::BTRFS_EXTENT_CSUM_OBJECTID) as u64;

        // One item per run of at most `csums_per_item` checksums.
        for (item_idx, payload) in
            csum_bytes.chunks(csums_per_item * CSUM_LEN).enumerate()
        {
            let first_sector = (item_idx * csums_per_item) as u64;
            let chunk_logical = logical_bytenr + first_sector * sectorsize;
            let key = DiskKey {
                objectid: csum_objectid,
                key_type: KeyType::ExtentCsum,
                offset: chunk_logical,
            };

            let mut path = BtrfsPath::new();
            let already_present = search::search_slot(
                Some(&mut *self),
                fs_info,
                7, // csum tree
                &key,
                &mut path,
                SearchIntent::Insert((ITEM_SIZE + payload.len()) as u32),
                true,
            )?;
            if already_present {
                path.release();
                return Err(io::Error::other(format!(
                    "insert_csums: csum item already exists at {chunk_logical}"
                )));
            }

            let Some(leaf) = path.nodes[0].as_mut() else {
                return Err(io::Error::other(
                    "insert_csums: no leaf in path",
                ));
            };
            let slot = path.slots[0];
            items::insert_item(leaf, slot, &key, payload)?;
            fs_info.mark_dirty(leaf);
            path.release();
        }

        Ok(())
    }

    /// Adjust an inode's `nbytes` field by `delta` bytes.
    ///
    /// Looks up the `INODE_ITEM` at `(inode, INODE_ITEM, 0)` in
    /// `tree_id`, rewrites only the 8-byte little-endian `nbytes` field
    /// inside the item payload, and marks the leaf dirty. Every other
    /// byte of the item is left untouched, so fields that are not
    /// modelled elsewhere (e.g. `flags`, `rdev`, `sequence`) survive
    /// verbatim.
    ///
    /// `delta` is signed: a positive value grows `nbytes`, a negative
    /// value shrinks it. A `delta` of zero returns immediately without
    /// touching the tree. The full `i64` range is accepted, including
    /// `i64::MIN`.
    ///
    /// # Errors
    ///
    /// Returns an error if the `INODE_ITEM` is missing, if its payload
    /// is too short to contain the `nbytes` field, if applying `delta`
    /// would underflow or overflow the `u64` field, or if the tree
    /// search fails.
    pub fn update_inode_nbytes(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        inode: u64,
        delta: i64,
    ) -> io::Result<()> {
        if delta == 0 {
            return Ok(());
        }

        let nbytes_off =
            std::mem::offset_of!(btrfs_disk::raw::btrfs_inode_item, nbytes);

        let key = DiskKey {
            objectid: inode,
            key_type: KeyType::InodeItem,
            offset: 0,
        };
        let mut path = BtrfsPath::new();
        let found = search::search_slot(
            Some(&mut *self),
            fs_info,
            tree_id,
            &key,
            &mut path,
            SearchIntent::ReadOnly,
            true,
        )?;
        if !found {
            path.release();
            return Err(io::Error::other(format!(
                "update_inode_nbytes: INODE_ITEM missing for inode {inode} in \
                 tree {tree_id}"
            )));
        }

        let leaf = path.nodes[0].as_mut().ok_or_else(|| {
            io::Error::other("update_inode_nbytes: no leaf in path")
        })?;
        let slot = path.slots[0];
        let item_len = leaf.item_size(slot) as usize;
        if item_len < nbytes_off + 8 {
            path.release();
            return Err(io::Error::other(format!(
                "update_inode_nbytes: INODE_ITEM payload {item_len} bytes < {}",
                nbytes_off + 8,
            )));
        }

        let payload = leaf.item_data_mut(slot);
        let current = u64::from_le_bytes(
            payload[nbytes_off..nbytes_off + 8].try_into().unwrap(),
        );

        // `unsigned_abs` is defined for every i64, including i64::MIN,
        // where negating the value first would overflow.
        let magnitude = delta.unsigned_abs();
        let adjusted = if delta < 0 {
            current.checked_sub(magnitude)
        } else {
            current.checked_add(magnitude)
        };
        let Some(updated) = adjusted else {
            // Release the path before bailing out, like the other
            // error returns in this function.
            path.release();
            return Err(if delta < 0 {
                io::Error::other(format!(
                    "update_inode_nbytes: underflow (current {current}, delta {delta})"
                ))
            } else {
                io::Error::other("update_inode_nbytes: overflow")
            });
        };

        payload[nbytes_off..nbytes_off + 8]
            .copy_from_slice(&updated.to_le_bytes());
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Overwrite an inode's `nlink` field with the absolute value
    /// `nlink`.
    ///
    /// Looks up the `INODE_ITEM` at `(inode, INODE_ITEM, 0)` in
    /// `tree_id`, rewrites only the 4-byte little-endian `nlink` field
    /// inside the item payload, and marks the leaf dirty. The remaining
    /// bytes of the item are not touched.
    ///
    /// Intended for callers that learn the final hardlink count late
    /// (for example after a directory walk has finished). This is the
    /// absolute-value counterpart of the delta-based
    /// `update_inode_nbytes`.
    ///
    /// # Errors
    ///
    /// Returns an error if the `INODE_ITEM` is missing, if its payload
    /// is too short to contain the `nlink` field, or if the tree search
    /// fails.
    pub fn set_inode_nlink(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        inode: u64,
        nlink: u32,
    ) -> io::Result<()> {
        // Byte range of `nlink` inside the raw on-disk inode item.
        let nlink_off =
            std::mem::offset_of!(btrfs_disk::raw::btrfs_inode_item, nlink);
        let nlink_end = nlink_off + 4;

        let key = DiskKey {
            objectid: inode,
            key_type: KeyType::InodeItem,
            offset: 0,
        };

        let mut path = BtrfsPath::new();
        let exists = search::search_slot(
            Some(&mut *self),
            fs_info,
            tree_id,
            &key,
            &mut path,
            SearchIntent::ReadOnly,
            true,
        )?;
        if !exists {
            path.release();
            return Err(io::Error::other(format!(
                "set_inode_nlink: INODE_ITEM missing for inode {inode} in \
                 tree {tree_id}"
            )));
        }

        let Some(leaf) = path.nodes[0].as_mut() else {
            return Err(io::Error::other("set_inode_nlink: no leaf in path"));
        };
        let slot = path.slots[0];

        // Refuse to patch an item that is too short to hold the field.
        let item_len = leaf.item_size(slot) as usize;
        if item_len < nlink_end {
            path.release();
            return Err(io::Error::other(format!(
                "set_inode_nlink: INODE_ITEM payload {item_len} bytes < {}",
                nlink_end,
            )));
        }

        leaf.item_data_mut(slot)[nlink_off..nlink_end]
            .copy_from_slice(&nlink.to_le_bytes());
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Store `data` as an inline `EXTENT_DATA` item of `inode` and add
    /// `data.len()` to the inode's `nbytes`.
    ///
    /// `data` is the uncompressed file content. With
    /// `compression = Some(algorithm)` the content is passed to
    /// [`try_compress`]; if that yields a result, the compressed bytes
    /// are embedded and the item records `algorithm`. If it yields
    /// nothing, or if `compression` is `None`, the raw bytes are
    /// embedded and the item records `CompressionType::None`.
    ///
    /// The embedded bytes (after the compression decision) must not
    /// exceed [`max_inline_data_size`] for the filesystem's sectorsize
    /// and nodesize. The item always records `data.len()` as the
    /// uncompressed size, and `nbytes` is bumped by that same
    /// uncompressed length.
    ///
    /// Inline extents are keyed at offset 0, so any other `file_offset`
    /// is rejected.
    ///
    /// # Errors
    ///
    /// Returns an error if `file_offset != 0`, if `data` is empty, if
    /// the bytes to embed exceed the inline limit, or if inserting the
    /// item or updating the inode fails.
    pub fn insert_inline_extent(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        inode: u64,
        file_offset: u64,
        data: &[u8],
        compression: Option<btrfs_disk::items::CompressionType>,
    ) -> io::Result<()> {
        use btrfs_disk::items::{CompressionType, FileExtentItem};

        if file_offset != 0 {
            return Err(io::Error::other(format!(
                "insert_inline_extent: file_offset must be 0, got {file_offset}"
            )));
        }
        if data.is_empty() {
            return Err(io::Error::other(
                "insert_inline_extent: empty data not supported",
            ));
        }

        // Use the compressed form only when compression was requested
        // and `try_compress` produced a result; otherwise embed the raw
        // bytes and record "no compression".
        let (embed, comp_byte) = compression
            .and_then(|algo| {
                try_compress(data, algo).map(|packed| (packed, algo))
            })
            .unwrap_or_else(|| (data.to_vec(), CompressionType::None));

        let max = max_inline_data_size(fs_info.sectorsize, fs_info.nodesize);
        if embed.len() > max {
            return Err(io::Error::other(format!(
                "insert_inline_extent: payload {} bytes exceeds inline limit {max}",
                embed.len(),
            )));
        }

        let logical_len = data.len();
        let extent_data = FileExtentItem::to_bytes_inline(
            self.transid,
            logical_len as u64,
            comp_byte,
            &embed,
        );
        self.insert_file_extent(fs_info, tree_id, inode, 0, &extent_data)?;

        // `nbytes` counts the logical (uncompressed) inline bytes, not
        // the bytes actually stored in the leaf.
        self.update_inode_nbytes(fs_info, tree_id, inode, logical_len as i64)
    }

    /// Store `data` as file content of `inode` in `tree_id`, beginning at
    /// `file_offset`. The inode's `INODE_ITEM` must already be present.
    ///
    /// `data` is cut into pieces of at most 1 MiB, and each piece becomes
    /// one extent:
    ///
    /// 1. If `compression` names an algorithm, the piece is run through
    ///    [`try_compress_regular`]; when that yields nothing the raw
    ///    bytes are used and the extent is marked uncompressed.
    /// 2. The resulting on-disk bytes are handed to
    ///    [`alloc_data_extent`](Self::alloc_data_extent), which returns
    ///    the extent's logical address.
    /// 3. An `EXTENT_DATA` item is inserted through
    ///    [`insert_file_extent`](Self::insert_file_extent). For
    ///    compressed pieces `disk_num_bytes` is the sector-aligned
    ///    compressed size, while `num_bytes`/`ram_bytes` are the
    ///    sector-aligned logical size.
    /// 4. Unless `nodatasum` is set, the on-disk bytes are read back and
    ///    checksummed via [`insert_csums`](Self::insert_csums).
    /// 5. The inode's `nbytes` is increased by the piece's sector-aligned
    ///    logical size (logical, not on-disk).
    ///
    /// When `file_offset == 0` and `data.len()` is within
    /// [`max_inline_data_size`], the whole payload is instead delegated
    /// to [`insert_inline_extent`](Self::insert_inline_extent).
    ///
    /// `compression` may be `Some(Zlib | Zstd | Lzo)` (LZO goes through
    /// [`try_compress_regular`]'s per-sector framing format) or `None`
    /// to skip the compression attempt entirely.
    ///
    /// Only `nbytes` is adjusted here; maintaining the file's logical
    /// size (`INODE_ITEM.size`) is left to the caller.
    ///
    /// # Errors
    ///
    /// Returns an error if `data` is empty, if any allocation/insert
    /// fails, or if the inode's `INODE_ITEM` is missing.
    #[allow(clippy::too_many_arguments)]
    pub fn write_file_data(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        inode: u64,
        file_offset: u64,
        data: &[u8],
        nodatasum: bool,
        compression: Option<btrfs_disk::items::CompressionType>,
    ) -> io::Result<()> {
        use btrfs_disk::items::{CompressionType, FileExtentItem};

        const MAX_EXTENT_SIZE: usize = 1024 * 1024;

        if data.is_empty() {
            return Err(io::Error::other(
                "write_file_data: empty data not supported",
            ));
        }

        // Small payloads starting at offset 0 are embedded directly in
        // the FS tree leaf: no extent allocation and no csums.
        let fits_inline = file_offset == 0
            && data.len()
                <= max_inline_data_size(fs_info.sectorsize, fs_info.nodesize);
        if fits_inline {
            return self.insert_inline_extent(
                fs_info,
                tree_id,
                inode,
                0,
                data,
                compression,
            );
        }

        let raw_sectorsize = fs_info.sectorsize;
        let sectorsize = u64::from(raw_sectorsize);

        for (index, chunk) in data.chunks(MAX_EXTENT_SIZE).enumerate() {
            // Every piece but the last is exactly MAX_EXTENT_SIZE long,
            // so the piece's offset within `data` is index * MAX.
            let chunk_logical_offset =
                file_offset + (index * MAX_EXTENT_SIZE) as u64;
            let aligned_logical = align_up(chunk.len() as u64, sectorsize);

            // Per-piece compression attempt with raw fallback
            // (compression byte = 0 in that case).
            let (disk_bytes, comp_byte) = compression
                .and_then(|ct| {
                    try_compress_regular(chunk, ct, raw_sectorsize)
                        .map(|packed| (packed, ct))
                })
                .unwrap_or_else(|| (chunk.to_vec(), CompressionType::None));
            let aligned_disk = align_up(disk_bytes.len() as u64, sectorsize);

            let logical = self.alloc_data_extent(
                fs_info,
                &disk_bytes,
                tree_id,
                inode,
                chunk_logical_offset,
            )?;

            let extent_data = FileExtentItem::to_bytes_regular(
                self.transid,
                aligned_logical,
                comp_byte,
                false,
                logical,
                aligned_disk,
                0,
                aligned_logical,
            );
            self.insert_file_extent(
                fs_info,
                tree_id,
                inode,
                chunk_logical_offset,
                &extent_data,
            )?;

            if !nodatasum {
                // Checksums are taken over what actually sits on disk
                // (compressed and padded), so read the extent back.
                let on_disk = fs_info
                    .reader_mut()
                    .read_data(logical, aligned_disk as usize)?;
                self.insert_csums(fs_info, logical, &on_disk)?;
            }

            // `nbytes` tracks the sector-aligned *logical* size rather
            // than the possibly smaller compressed on-disk size; btrfs
            // check sums num_bytes for found_size.
            self.update_inode_nbytes(
                fs_info,
                tree_id,
                inode,
                aligned_logical as i64,
            )?;
        }

        Ok(())
    }

    /// Create the `INODE_ITEM` record for `inode` inside `tree_id`.
    ///
    /// The item is keyed `(inode, INODE_ITEM, 0)` and its payload is
    /// `args.to_bytes()`; see [`crate::inode::InodeArgs`] for the
    /// on-disk fields it carries.
    ///
    /// # Errors
    ///
    /// Fails if an `INODE_ITEM` already occupies that key, or if the
    /// search or the insertion fails.
    pub fn create_inode(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        inode: u64,
        args: &crate::inode::InodeArgs,
    ) -> io::Result<()> {
        let payload = args.to_bytes();
        let key = DiskKey {
            objectid: inode,
            key_type: KeyType::InodeItem,
            offset: 0,
        };
        // Space requested from the search: item header plus payload.
        let reserve = (ITEM_SIZE + payload.len()) as u32;

        let mut path = BtrfsPath::new();
        let already_there = search::search_slot(
            Some(&mut *self),
            fs_info,
            tree_id,
            &key,
            &mut path,
            SearchIntent::Insert(reserve),
            true,
        )?;
        if already_there {
            path.release();
            return Err(io::Error::other(format!(
                "create_inode: INODE_ITEM already exists for inode {inode} \
                 in tree {tree_id}"
            )));
        }

        let slot = path.slots[0];
        let Some(leaf) = path.nodes[0].as_mut() else {
            return Err(io::Error::other("create_inode: no leaf in path"));
        };
        items::insert_item(leaf, slot, &key, &payload)?;
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Make `child_inode` reachable as `name` inside the directory
    /// `parent_inode` of `tree_id`.
    ///
    /// Three items are inserted, in this order:
    ///
    /// 1. `INODE_REF` at `(child_inode, INODE_REF, parent_inode)` —
    ///    the back-pointer used by `..` resolution and orphan recovery.
    /// 2. `DIR_ITEM` at `(parent_inode, DIR_ITEM, name_hash(name))` —
    ///    the name-hash-keyed entry walked on lookup.
    /// 3. `DIR_INDEX` at `(parent_inode, DIR_INDEX, dir_index)` — the
    ///    monotonically-increasing entry walked on `readdir`. It
    ///    carries the same payload as the `DIR_ITEM`.
    ///
    /// Afterwards the parent's on-disk `size` grows by
    /// `2 * name.len()` (one `name_len` for each of `DIR_ITEM` and
    /// `DIR_INDEX`), following modern btrfs's directory-isize
    /// convention. When `parent_inode` is the canonical subvolume root
    /// directory (`BTRFS_FIRST_FREE_OBJECTID` = 256) the same delta is
    /// also applied to the `inode_data` embedded in the tree's
    /// `ROOT_ITEM`, so `btrfs check`'s root-item consistency check
    /// passes.
    ///
    /// `dir_index` is the per-directory monotonic counter supplied by
    /// the caller. mkfs starts at 2 (0 and 1 are reserved for `.` and
    /// `..`) and increments per child.
    ///
    /// `file_type` is the `BTRFS_FT_*` byte (regular file, directory,
    /// symlink, etc.).
    ///
    /// # Errors
    ///
    /// Returns an error if any of the inserted items already exist, if
    /// the parent inode is missing, or if any tree operation fails.
    #[allow(clippy::too_many_arguments)]
    pub fn link_dir_entry(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        parent_inode: u64,
        child_inode: u64,
        name: &[u8],
        file_type: u8,
        dir_index: u64,
        time: btrfs_disk::items::Timespec,
    ) -> io::Result<()> {
        use btrfs_disk::{
            items::{DirItem, InodeRef},
            util::btrfs_name_hash,
        };

        let transid = self.transid;
        // One name_len for the DIR_ITEM plus one for the DIR_INDEX.
        let size_delta = (name.len() as u64) * 2;

        // Back-reference: child -> parent.
        let backref_key = DiskKey {
            objectid: child_inode,
            key_type: KeyType::InodeRef,
            offset: parent_inode,
        };
        let backref_payload = InodeRef::serialize(dir_index, name);
        self.insert_item_helper(
            fs_info,
            tree_id,
            &backref_key,
            &backref_payload,
        )?;

        // Both directory records share one payload whose location key
        // names the child's INODE_ITEM.
        let child_location = DiskKey {
            objectid: child_inode,
            key_type: KeyType::InodeItem,
            offset: 0,
        };
        let entry_payload =
            DirItem::serialize(&child_location, transid, file_type, name);

        // Lookup record, keyed by the hash of the name.
        let by_hash = DiskKey {
            objectid: parent_inode,
            key_type: KeyType::DirItem,
            offset: u64::from(btrfs_name_hash(name)),
        };
        self.insert_item_helper(fs_info, tree_id, &by_hash, &entry_payload)?;

        // Readdir record, keyed by the directory index.
        let by_index = DiskKey {
            objectid: parent_inode,
            key_type: KeyType::DirIndex,
            offset: dir_index,
        };
        self.insert_item_helper(fs_info, tree_id, &by_index, &entry_payload)?;

        // Grow the parent directory's size; patched in place so flags /
        // rdev / etc. are preserved.
        self.bump_dir_size(
            fs_info,
            tree_id,
            parent_inode,
            size_delta,
            transid,
            time,
        )?;

        // For the canonical subvolume root dir (inode 256), keep the
        // ROOT_ITEM's embedded inode in step with the INODE_ITEM.
        let root_dir = u64::from(btrfs_disk::raw::BTRFS_FIRST_FREE_OBJECTID);
        if parent_inode == root_dir {
            self.mirror_root_item_size(fs_info, tree_id, size_delta)?;
        }

        Ok(())
    }

    /// Expose the subvolume tree `subvol_id` as the entry `name` inside
    /// directory `parent_inode` of `tree_id`.
    ///
    /// This mirrors [`link_dir_entry`](Self::link_dir_entry) with two
    /// differences: the entry's location key names a `ROOT_ITEM`
    /// rather than an `INODE_ITEM`, and no `INODE_REF` is written
    /// (subvolume parent linkage is recorded via `ROOT_REF` /
    /// `ROOT_BACKREF` in the root tree, not via `INODE_REF` in the
    /// containing FS tree). Call
    /// [`insert_root_ref`](Self::insert_root_ref) afterwards to add
    /// those.
    ///
    /// Items inserted:
    ///
    /// 1. `DIR_ITEM` at `(parent_inode, DIR_ITEM, name_hash(name))`
    ///    with `location = (subvol_id, ROOT_ITEM, 0)` and
    ///    `file_type = BTRFS_FT_DIR`.
    /// 2. `DIR_INDEX` at `(parent_inode, DIR_INDEX, dir_index)` with
    ///    the same payload.
    ///
    /// `parent_inode.size` then grows by `2 * name.len()`, and if
    /// `parent_inode == 256` the delta is mirrored into the containing
    /// `ROOT_ITEM`'s embedded inode, exactly like `link_dir_entry`.
    ///
    /// # Errors
    ///
    /// Returns an error if either inserted item already exists, or
    /// if any underlying tree operation fails.
    #[allow(clippy::too_many_arguments)]
    pub fn link_subvol_entry(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        parent_inode: u64,
        subvol_id: u64,
        name: &[u8],
        dir_index: u64,
        time: btrfs_disk::items::Timespec,
    ) -> io::Result<()> {
        use btrfs_disk::{items::DirItem, util::btrfs_name_hash};

        let transid = self.transid;
        // One name_len for the DIR_ITEM plus one for the DIR_INDEX.
        let size_delta = (name.len() as u64) * 2;

        // Shared payload: the location key names the subvolume's
        // ROOT_ITEM and the entry is typed as a directory.
        let subvol_location = DiskKey {
            objectid: subvol_id,
            key_type: KeyType::RootItem,
            offset: 0,
        };
        let entry_payload = DirItem::serialize(
            &subvol_location,
            transid,
            btrfs_disk::raw::BTRFS_FT_DIR as u8,
            name,
        );

        // Lookup record first, then the readdir record.
        let by_hash = DiskKey {
            objectid: parent_inode,
            key_type: KeyType::DirItem,
            offset: u64::from(btrfs_name_hash(name)),
        };
        self.insert_item_helper(fs_info, tree_id, &by_hash, &entry_payload)?;

        let by_index = DiskKey {
            objectid: parent_inode,
            key_type: KeyType::DirIndex,
            offset: dir_index,
        };
        self.insert_item_helper(fs_info, tree_id, &by_index, &entry_payload)?;

        self.bump_dir_size(
            fs_info,
            tree_id,
            parent_inode,
            size_delta,
            transid,
            time,
        )?;

        let root_dir = u64::from(btrfs_disk::raw::BTRFS_FIRST_FREE_OBJECTID);
        if parent_inode == root_dir {
            self.mirror_root_item_size(fs_info, tree_id, size_delta)?;
        }
        Ok(())
    }

    /// Mirror a `+delta` size bump from inode 256's `INODE_ITEM` into
    /// the embedded `inode_data` of `tree_id`'s `ROOT_ITEM` so the
    /// kernel's root-tree consistency check (and `btrfs check`) sees
    /// matching values.
    ///
    /// The `ROOT_ITEM` is looked up at `(tree_id, ROOT_ITEM, 0)` in the
    /// root tree (id 1), parsed, patched at the offset of
    /// `btrfs_inode_item.size` inside `inode_data`, reserialised, and
    /// written back over the existing item.
    ///
    /// # Errors
    ///
    /// Returns an error if the `ROOT_ITEM` is missing or malformed, if
    /// its `inode_data` is too short to contain the `size` field, if
    /// `size + delta` would overflow `u64`, or if any tree operation
    /// fails.
    fn mirror_root_item_size(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        delta: u64,
    ) -> io::Result<()> {
        use btrfs_disk::items::RootItem;

        let root_key = DiskKey {
            objectid: tree_id,
            key_type: KeyType::RootItem,
            offset: 0,
        };
        let mut path = BtrfsPath::new();
        let found = search::search_slot(
            Some(&mut *self),
            fs_info,
            1,
            &root_key,
            &mut path,
            SearchIntent::ReadOnly,
            true,
        )?;
        if !found {
            path.release();
            return Err(io::Error::other(format!(
                "mirror_root_item_size: ROOT_ITEM for tree {tree_id} not found"
            )));
        }
        let leaf = path.nodes[0].as_mut().ok_or_else(|| {
            io::Error::other("mirror_root_item_size: no leaf in path")
        })?;
        let slot = path.slots[0];
        let ri_data = leaf.item_data(slot).to_vec();
        // Release the path on this failure exit too, matching the other
        // validation failures in this function.
        let Some(mut root_item) = RootItem::parse(&ri_data) else {
            path.release();
            return Err(io::Error::other(
                "mirror_root_item_size: malformed ROOT_ITEM",
            ));
        };
        let size_off =
            std::mem::offset_of!(btrfs_disk::raw::btrfs_inode_item, size);
        if root_item.inode_data.len() < size_off + 8 {
            path.release();
            return Err(io::Error::other(
                "mirror_root_item_size: ROOT_ITEM inode_data shorter than btrfs_inode_item",
            ));
        }
        let old_size = u64::from_le_bytes(
            root_item.inode_data[size_off..size_off + 8]
                .try_into()
                .expect("range is exactly 8 bytes long"),
        );
        // Refuse to persist a wrapped size: plain `+=` would panic in
        // debug builds and silently wrap in release builds.
        let Some(new_size) = old_size.checked_add(delta) else {
            path.release();
            return Err(io::Error::other(format!(
                "mirror_root_item_size: size {old_size} + {delta} overflows u64 \
                 for tree {tree_id}"
            )));
        };
        root_item.inode_data[size_off..size_off + 8]
            .copy_from_slice(&new_size.to_le_bytes());
        let new_ri = root_item.to_bytes();
        items::update_item(leaf, slot, &new_ri)?;
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Search `tree_id` for `key` with insert intent and, provided the
    /// key is not already present, insert `data` at the slot the search
    /// returned. Shared plumbing for the high-level dir/inode/xattr
    /// APIs.
    ///
    /// Fails if an item already exists at `key`, if the path has no
    /// leaf, or if the search or insertion fails.
    fn insert_item_helper(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        key: &DiskKey,
        data: &[u8],
    ) -> io::Result<()> {
        // Space requested from the search: item header plus payload.
        let reserve = (ITEM_SIZE + data.len()) as u32;

        let mut path = BtrfsPath::new();
        let occupied = search::search_slot(
            Some(&mut *self),
            fs_info,
            tree_id,
            key,
            &mut path,
            SearchIntent::Insert(reserve),
            true,
        )?;
        if occupied {
            path.release();
            return Err(io::Error::other(format!(
                "insert_item_helper: item already exists at {key:?} in tree {tree_id}"
            )));
        }

        let slot = path.slots[0];
        let Some(leaf) = path.nodes[0].as_mut() else {
            return Err(io::Error::other(
                "insert_item_helper: no leaf in path",
            ));
        };
        items::insert_item(leaf, slot, key, data)?;
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Bump a directory inode's `size` by `delta`, refresh `transid`,
    /// and update `ctime`/`mtime`. Patches in place at fixed offsets to
    /// preserve flags / rdev / sequence and other fields not modeled by
    /// `InodeArgs` round-tripping.
    ///
    /// The `INODE_ITEM` is looked up at `(inode, INODE_ITEM, 0)` in
    /// `tree_id`. Field offsets are taken from the raw
    /// `btrfs_inode_item` layout. Both timestamps are set to `time`
    /// (8-byte seconds followed by 4-byte nanoseconds).
    ///
    /// # Errors
    ///
    /// Returns an error if the `INODE_ITEM` is missing, if its payload
    /// is too short to hold every patched field, if `size + delta`
    /// would overflow `u64`, or if the search fails.
    fn bump_dir_size(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        inode: u64,
        delta: u64,
        transid: u64,
        time: btrfs_disk::items::Timespec,
    ) -> io::Result<()> {
        let key = DiskKey {
            objectid: inode,
            key_type: KeyType::InodeItem,
            offset: 0,
        };
        let mut path = BtrfsPath::new();
        let found = search::search_slot(
            Some(&mut *self),
            fs_info,
            tree_id,
            &key,
            &mut path,
            SearchIntent::ReadOnly,
            true,
        )?;
        if !found {
            path.release();
            return Err(io::Error::other(format!(
                "bump_dir_size: INODE_ITEM missing for inode {inode}"
            )));
        }
        let leaf = path.nodes[0].as_mut().ok_or_else(|| {
            io::Error::other("bump_dir_size: no leaf in path")
        })?;
        let slot = path.slots[0];

        let transid_off =
            std::mem::offset_of!(btrfs_disk::raw::btrfs_inode_item, transid);
        let size_off =
            std::mem::offset_of!(btrfs_disk::raw::btrfs_inode_item, size);
        let ctime_off =
            std::mem::offset_of!(btrfs_disk::raw::btrfs_inode_item, ctime);
        let mtime_off =
            std::mem::offset_of!(btrfs_disk::raw::btrfs_inode_item, mtime);

        // Validate the payload length before slicing at fixed offsets:
        // a truncated INODE_ITEM must surface as an error, not as an
        // out-of-bounds panic. Timestamps occupy 12 bytes (8 + 4).
        let required = (size_off + 8)
            .max(transid_off + 8)
            .max(ctime_off + 12)
            .max(mtime_off + 12);
        let payload_len = leaf.item_data(slot).len();
        if payload_len < required {
            path.release();
            return Err(io::Error::other(format!(
                "bump_dir_size: INODE_ITEM for inode {inode} is {payload_len} bytes, \
                 need at least {required}"
            )));
        }

        let payload = leaf.item_data_mut(slot);
        // size += delta, refusing to write a wrapped value.
        let old_size = u64::from_le_bytes(
            payload[size_off..size_off + 8]
                .try_into()
                .expect("range is exactly 8 bytes long"),
        );
        let Some(new_size) = old_size.checked_add(delta) else {
            path.release();
            return Err(io::Error::other(format!(
                "bump_dir_size: size {old_size} + {delta} overflows u64 \
                 for inode {inode}"
            )));
        };
        payload[size_off..size_off + 8]
            .copy_from_slice(&new_size.to_le_bytes());
        // transid
        payload[transid_off..transid_off + 8]
            .copy_from_slice(&transid.to_le_bytes());
        // ctime and mtime both receive the caller-supplied timestamp.
        for time_off in [ctime_off, mtime_off] {
            payload[time_off..time_off + 8]
                .copy_from_slice(&time.sec.to_le_bytes());
            payload[time_off + 8..time_off + 12]
                .copy_from_slice(&time.nsec.to_le_bytes());
        }

        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Insert an `XATTR_ITEM` carrying `(name, value)` for `inode`.
    ///
    /// XATTR items share the on-disk format with `DIR_ITEM`: the
    /// "directory entry" is a `(name, value)` pair where the value
    /// fills the data area instead of being zero-length. The key is
    /// `(inode, XATTR_ITEM, name_hash(name))`.
    ///
    /// The payload's `data_len` and `name_len` fields are 16 bits wide,
    /// so both lengths are validated before anything is serialised.
    ///
    /// # Errors
    ///
    /// Returns an error if `name.len()` or `value.len()` does not fit
    /// in a `u16`, if an XATTR with the same name hash already exists
    /// (no chain-walk-and-append for collisions in v1), or if any tree
    /// operation fails.
    pub fn set_xattr(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
        inode: u64,
        name: &[u8],
        value: &[u8],
    ) -> io::Result<()> {
        use btrfs_disk::util::btrfs_name_hash;
        use bytes::BufMut;

        // A plain `as u16` cast would silently truncate an oversized
        // length and write a header that disagrees with the bytes that
        // follow it; reject such input instead.
        let name_len: u16 = name.len().try_into().map_err(|_| {
            io::Error::other(format!(
                "set_xattr: name length {} does not fit the 16-bit name_len field",
                name.len()
            ))
        })?;
        let value_len: u16 = value.len().try_into().map_err(|_| {
            io::Error::other(format!(
                "set_xattr: value length {} does not fit the 16-bit data_len field",
                value.len()
            ))
        })?;

        // XATTR_ITEM payload: 17-byte location key (zeroed) + 8B transid
        // + 2B data_len + 2B name_len + 1B type + name + value.
        let total = 17 + 8 + 2 + 2 + 1 + name.len() + value.len();
        let mut data = Vec::with_capacity(total);
        // location key — for xattrs the location is unused; mkfs writes
        // it as the all-zero "Untyped" key.
        data.extend_from_slice(&[0u8; 17]);
        data.put_u64_le(self.transid);
        data.put_u16_le(value_len);
        data.put_u16_le(name_len);
        data.put_u8(btrfs_disk::raw::BTRFS_FT_XATTR as u8);
        data.put_slice(name);
        data.put_slice(value);
        debug_assert_eq!(data.len(), total);

        let key = DiskKey {
            objectid: inode,
            key_type: KeyType::XattrItem,
            offset: u64::from(btrfs_name_hash(name)),
        };
        self.insert_item_helper(fs_info, tree_id, &key, &data)
    }

    /// Record in the root tree (id 1) that subvolume `child_root` is
    /// reachable as `name` (directory sequence `dir_index`) inside
    /// directory inode `dirid` of `parent_root`, by inserting a
    /// `ROOT_REF` / `ROOT_BACKREF` pair.
    ///
    /// Both items carry the same serialised `btrfs_root_ref` payload;
    /// only the keys differ:
    ///
    /// - `ROOT_REF` at `(parent_root, ROOT_REF, child_root)` — the
    ///   parent → child entry walked when listing a parent's
    ///   subvolume children.
    /// - `ROOT_BACKREF` at `(child_root, ROOT_BACKREF, parent_root)`
    ///   — the child → parent backref used by `subvolume list` to
    ///   reconstruct the path from a subvolume up to the mount root.
    ///
    /// The `ROOT_REF` is inserted first. Keeping the matching
    /// `DIR_ITEM` and `DIR_INDEX` entries inside `parent_root`'s FS
    /// tree consistent is the caller's job (typically via a parallel
    /// `link_dir_entry`-style emission).
    ///
    /// # Errors
    ///
    /// Returns an error if either item already exists, or if any tree
    /// operation fails.
    pub fn insert_root_ref(
        &mut self,
        fs_info: &mut Filesystem<R>,
        parent_root: u64,
        child_root: u64,
        dirid: u64,
        dir_index: u64,
        name: &[u8],
    ) -> io::Result<()> {
        use btrfs_disk::items::RootRef;

        const ROOT_TREE_ID: u64 = 1;

        let payload = RootRef::serialize(dirid, dir_index, name);

        // Forward reference first, back reference second.
        let keys = [
            DiskKey {
                objectid: parent_root,
                key_type: KeyType::RootRef,
                offset: child_root,
            },
            DiskKey {
                objectid: child_root,
                key_type: KeyType::RootBackref,
                offset: parent_root,
            },
        ];
        for key in &keys {
            self.insert_item_helper(fs_info, ROOT_TREE_ID, key, &payload)?;
        }

        Ok(())
    }

    /// Mark the subvolume `tree_id` read-only by setting the `RDONLY`
    /// bit in its `ROOT_ITEM.flags`.
    ///
    /// The `ROOT_ITEM` at `(tree_id, ROOT_ITEM, 0)` is fetched from the
    /// root tree, parsed, has `RootItemFlags::RDONLY` OR-ed into
    /// `flags`, and is reserialised over the existing item. Other
    /// fields are preserved verbatim.
    ///
    /// Idempotent: a tree that is already RDONLY ends up with the same
    /// on-disk bytes (the leaf is still marked dirty, which costs one
    /// COW per call).
    ///
    /// # Errors
    ///
    /// Returns an error if no `ROOT_ITEM` exists for `tree_id`, or if
    /// any tree operation fails.
    pub fn set_root_readonly(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
    ) -> io::Result<()> {
        use btrfs_disk::items::{RootItem, RootItemFlags};

        let root_item_key = DiskKey {
            objectid: tree_id,
            key_type: KeyType::RootItem,
            offset: 0,
        };
        let mut path = BtrfsPath::new();
        let present = search::search_slot(
            Some(&mut *self),
            fs_info,
            1, // root tree
            &root_item_key,
            &mut path,
            SearchIntent::ReadOnly,
            true,
        )?;
        if !present {
            path.release();
            return Err(io::Error::other(format!(
                "set_root_readonly: ROOT_ITEM for tree {tree_id} not found"
            )));
        }

        let slot = path.slots[0];
        let Some(leaf) = path.nodes[0].as_mut() else {
            return Err(io::Error::other(
                "set_root_readonly: no leaf in path",
            ));
        };

        // Work on a private copy of the item bytes so the leaf is only
        // touched again when the updated item is written back.
        let current = leaf.item_data(slot).to_vec();
        let Some(mut root_item) = RootItem::parse(&current) else {
            return Err(io::Error::other(
                "set_root_readonly: malformed ROOT_ITEM",
            ));
        };
        root_item.flags |= RootItemFlags::RDONLY;

        let updated = root_item.to_bytes();
        items::update_item(leaf, slot, &updated)?;
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Make `subvol_id` the filesystem's default subvolume.
    ///
    /// Upserts a `DIR_ITEM` named `"default"` keyed
    /// `(BTRFS_ROOT_TREE_DIR_OBJECTID = 6, DIR_ITEM,
    /// name_hash("default"))` in the root tree (id 1). Its location
    /// key is `(subvol_id, ROOT_ITEM, u64::MAX)`, matching mkfs's
    /// convention; kernel `btrfs_find_root` does a range search by
    /// objectid so the offset value is not load-bearing. On mount
    /// without an explicit `subvolid=`, the kernel resolves the
    /// default to `subvol_id`.
    ///
    /// This is the on-disk equivalent of `btrfs subvolume set-default
    /// <subvol_id>`. Idempotent: an existing "default" `DIR_ITEM`
    /// (e.g. mkfs's default pointing at the FS tree) is overwritten in
    /// place. The payload size does not depend on `subvol_id`, so no
    /// delete+insert cycle is needed.
    ///
    /// The matching `INODE_ITEM` for inode 6 is left to the caller
    /// (mkfs historically did not create one — the kernel does not
    /// require it for the default-subvol lookup path).
    ///
    /// # Errors
    ///
    /// Returns an error if any tree operation fails.
    pub fn set_default_subvol(
        &mut self,
        fs_info: &mut Filesystem<R>,
        subvol_id: u64,
    ) -> io::Result<()> {
        use btrfs_disk::{items::DirItem, util::btrfs_name_hash};

        const DEFAULT_NAME: &[u8] = b"default";

        let key = DiskKey {
            objectid: u64::from(btrfs_disk::raw::BTRFS_ROOT_TREE_DIR_OBJECTID),
            key_type: KeyType::DirItem,
            offset: u64::from(btrfs_name_hash(DEFAULT_NAME)),
        };
        let target = DiskKey {
            objectid: subvol_id,
            key_type: KeyType::RootItem,
            offset: u64::MAX,
        };
        let payload = DirItem::serialize(
            &target,
            self.transid,
            btrfs_disk::raw::BTRFS_FT_DIR as u8,
            DEFAULT_NAME,
        );
        // Space requested from the search: item header plus payload.
        let reserve = (ITEM_SIZE + payload.len()) as u32;

        let mut path = BtrfsPath::new();
        let exists = search::search_slot(
            Some(&mut *self),
            fs_info,
            1,
            &key,
            &mut path,
            SearchIntent::Insert(reserve),
            true,
        )?;

        let slot = path.slots[0];
        let Some(leaf) = path.nodes[0].as_mut() else {
            return Err(io::Error::other(
                "set_default_subvol: no leaf in path",
            ));
        };
        if !exists {
            items::insert_item(leaf, slot, &key, &payload)?;
        } else {
            // An existing "default" entry (e.g. mkfs's bootstrap one
            // pointing at the FS tree) has the same payload size, so
            // it is patched in place.
            items::update_item(leaf, slot, &payload)?;
        }
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Overwrite the `total_bytes` field of `devid`'s `DEV_ITEM` in the
    /// chunk tree (id 3), and copy the new value into the superblock's
    /// embedded `dev_item` when `devid` is the device recorded there.
    ///
    /// The previous `total_bytes` is returned so the call site can do
    /// its own accounting (e.g. updating `fs.superblock.total_bytes`).
    ///
    /// Used by:
    /// - `mkfs --rootdir --shrink`: trim the device after rootdir
    ///   population so the image is no larger than the chunk layout
    ///   actually requires.
    /// - `rescue fix-device-size`: re-align device totals when the
    ///   on-disk values disagree with the underlying block device or
    ///   image file (currently does its own version of this; can be
    ///   migrated to this helper).
    ///
    /// `fs.superblock.total_bytes` (the sum across all devices) is not
    /// touched; the caller owns it. Only the per-device `DEV_ITEM` and
    /// the superblock's embedded `dev_item.total_bytes` are written
    /// (single-device convenience). Multi-device callers sum the
    /// per-device values manually.
    ///
    /// # Errors
    ///
    /// Returns an error if the `DEV_ITEM` for `devid` cannot be
    /// found in the chunk tree, or if any tree operation fails.
    pub fn set_device_total_bytes(
        &mut self,
        fs_info: &mut Filesystem<R>,
        devid: u64,
        new_total: u64,
    ) -> io::Result<u64> {
        // Byte offset of `total_bytes` inside the on-disk
        // `btrfs_dev_item`: it follows the leading u64 `devid`.
        const TOTAL_BYTES_OFFSET: usize = 8;

        let chunk_tree = u64::from(btrfs_disk::raw::BTRFS_CHUNK_TREE_OBJECTID);
        let dev_key = DiskKey {
            objectid: u64::from(btrfs_disk::raw::BTRFS_DEV_ITEMS_OBJECTID),
            key_type: KeyType::DeviceItem,
            offset: devid,
        };

        let mut path = BtrfsPath::new();
        let located = search::search_slot(
            Some(&mut *self),
            fs_info,
            chunk_tree,
            &dev_key,
            &mut path,
            SearchIntent::ReadOnly,
            true,
        )?;
        if !located {
            path.release();
            return Err(io::Error::other(format!(
                "set_device_total_bytes: DEV_ITEM for devid {devid} not found"
            )));
        }

        let slot = path.slots[0];
        let Some(leaf) = path.nodes[0].as_mut() else {
            return Err(io::Error::other(
                "set_device_total_bytes: no leaf in path",
            ));
        };

        // The item must be long enough to contain the 8-byte field.
        let item_len = leaf.item_size(slot) as usize;
        if item_len < TOTAL_BYTES_OFFSET + 8 {
            path.release();
            return Err(io::Error::other(format!(
                "set_device_total_bytes: DEV_ITEM payload {item_len} bytes < {}",
                TOTAL_BYTES_OFFSET + 8,
            )));
        }

        // Read the old value, then overwrite it in place.
        let field =
            &mut leaf.item_data_mut(slot)[TOTAL_BYTES_OFFSET..TOTAL_BYTES_OFFSET + 8];
        let mut previous = [0u8; 8];
        previous.copy_from_slice(field);
        let old_total = u64::from_le_bytes(previous);
        field.copy_from_slice(&new_total.to_le_bytes());
        fs_info.mark_dirty(leaf);
        path.release();

        // Only the primary device's dev_item lives in the superblock;
        // multi-device callers handle each device separately.
        let embedded = &mut fs_info.superblock.dev_item;
        if embedded.devid == devid {
            embedded.total_bytes = new_total;
        }

        Ok(old_total)
    }

    /// Record `logical` as pinned: freed in this transaction, but the
    /// free is not committed yet.
    ///
    /// A pinned block must not be handed out again before the commit.
    /// The superblock currently on disk still references it, so
    /// overwriting the address early would destroy the last consistent
    /// state if a crash happened before the new superblock is written.
    pub fn pin_block(&mut self, logical: u64) {
        // Whether the address was already pinned is of no interest.
        let _ = self.pinned.insert(logical);
    }

    /// Report whether `logical` is currently in this transaction's
    /// pinned set (see [`Transaction::pin_block`]).
    #[must_use]
    pub fn is_pinned(&self, logical: u64) -> bool {
        let pinned = &self.pinned;
        pinned.contains(&logical)
    }

    /// Append `logical` to the list of blocks to be freed once the
    /// transaction has committed.
    pub fn queue_free_block(&mut self, logical: u64) {
        let pending = &mut self.freed_blocks;
        pending.push(logical);
    }

    /// Create a brand-new, empty global tree identified by `tree_id`.
    ///
    /// The steps, in order:
    ///
    /// 1. Allocate one metadata block.
    /// 2. Initialise it as an empty level-0 leaf owned by `tree_id`.
    /// 3. Register `tree_id -> bytenr` in the in-memory roots map.
    /// 4. Insert a `ROOT_ITEM` keyed `(tree_id, ROOT_ITEM, 0)` into the
    ///    root tree, pointing at the new leaf.
    ///
    /// Nothing is flushed here. Both the leaf and the root item are
    /// only staged, and reach disk when the caller runs `commit`
    /// (optionally after populating the tree). Later inserts into the
    /// new tree use the regular `search_slot`/insert pipeline and may
    /// COW the empty leaf away.
    ///
    /// Whole-tree creation (e.g. `convert-to-free-space-tree`,
    /// `convert-to-block-group-tree`) builds on this primitive. The
    /// root tree (id 1), extent tree (id 2) and chunk tree (id 3) are
    /// deliberately **not** creatable through it: they are bootstrap
    /// state owned by the superblock and the transaction pipeline.
    ///
    /// Returns the logical bytenr of the new leaf.
    ///
    /// # Errors
    ///
    /// * `tree_id` is `0`, `1`, `2`, or `3`.
    /// * `tree_id` already has a root in the in-memory roots map.
    /// * The metadata allocator fails.
    /// * The root-tree insert fails.
    pub fn create_empty_tree(
        &mut self,
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
    ) -> io::Result<u64> {
        // Ids 0..=3 are off limits: 1 and 3 are rooted in the
        // superblock and 2 backs the allocator's own bookkeeping.
        // Overwriting any of them here would corrupt the in-memory
        // roots map and break commit.
        if tree_id <= 3 {
            return Err(io::Error::other(format!(
                "create_empty_tree: tree id {tree_id} is reserved bootstrap state",
            )));
        }

        if fs_info.root_bytenr(tree_id).is_some() {
            return Err(io::Error::other(format!(
                "create_empty_tree: tree id {tree_id} already exists",
            )));
        }

        // Borrow the fsid and chunk_tree_uuid from the current root
        // tree root block. All tree blocks of a healthy filesystem
        // carry the same pair (fsid is the metadata uuid when the
        // METADATA_UUID incompat flag is set, the plain fsid
        // otherwise). cow_block, split_leaf and split_node inherit
        // them the same way, so btrfs check cannot tell this leaf
        // from a COWed one.
        let (fsid, chunk_tree_uuid) = {
            let Some(root_tree_bytenr) = fs_info.root_bytenr(1) else {
                return Err(io::Error::other(
                    "create_empty_tree: root tree (id 1) has no root bytenr",
                ));
            };
            let root_block = fs_info.read_block(root_tree_bytenr)?;
            (root_block.fsid(), root_block.chunk_tree_uuid())
        };

        // Reserve the leaf. alloc_tree_block picks a metadata block
        // group, queues the +1 metadata extent ref, and records the
        // allocation in bg_range_deltas so the free space tree stays
        // in sync at commit.
        let leaf_bytenr = self.alloc_tree_block(fs_info, tree_id, 0)?;

        // The top 8 bits of the header `flags` hold the backref
        // revision (BTRFS_BACKREF_REV_SHIFT = 56). Modern btrfs uses
        // BTRFS_MIXED_BACKREF_REV = 1; revision 0 would be read as
        // the obsolete pre-mixed-backref format and rejected by btrfs
        // check. WRITTEN (bit 0) is left clear on purpose:
        // flush_dirty sets it right before checksumming.
        let header_flags = u64::from(btrfs_disk::raw::BTRFS_MIXED_BACKREF_REV)
            << btrfs_disk::raw::BTRFS_BACKREF_REV_SHIFT;

        let nodesize = fs_info.nodesize;
        let mut empty_leaf = ExtentBuffer::new_zeroed(nodesize, leaf_bytenr);
        empty_leaf.set_bytenr(leaf_bytenr);
        empty_leaf.set_level(0);
        empty_leaf.set_nritems(0);
        empty_leaf.set_generation(self.transid);
        empty_leaf.set_owner(tree_id);
        empty_leaf.set_fsid(&fsid);
        empty_leaf.set_chunk_tree_uuid(&chunk_tree_uuid);
        empty_leaf.set_flags(header_flags);

        debug_assert_eq!(empty_leaf.level(), 0);
        debug_assert_eq!(empty_leaf.nritems(), 0);
        debug_assert_eq!(empty_leaf.owner(), tree_id);
        debug_assert_eq!(empty_leaf.generation(), self.transid);
        debug_assert_eq!(
            empty_leaf.leaf_free_space(),
            nodesize - HEADER_SIZE as u32,
            "create_empty_tree: empty leaf must have full free space",
        );

        fs_info.mark_dirty(&empty_leaf);
        fs_info.set_root_bytenr(tree_id, leaf_bytenr);

        // Stage the ROOT_ITEM that makes the tree reachable from the
        // root tree.
        let item_key = DiskKey {
            objectid: tree_id,
            key_type: KeyType::RootItem,
            offset: 0,
        };
        let item_bytes =
            RootItem::new_internal(self.transid, leaf_bytenr, 0).to_bytes();
        let space_needed = (ITEM_SIZE + item_bytes.len()) as u32;

        let mut path = BtrfsPath::new();
        let already_present = search::search_slot(
            Some(&mut *self),
            fs_info,
            1u64,
            &item_key,
            &mut path,
            SearchIntent::Insert(space_needed),
            true,
        )?;
        if already_present {
            path.release();
            return Err(io::Error::other(format!(
                "create_empty_tree: ROOT_ITEM for tree {tree_id} already in root tree",
            )));
        }

        let Some(root_leaf) = path.nodes[0].as_mut() else {
            return Err(io::Error::other(
                "create_empty_tree: no leaf in path after search",
            ));
        };
        items::insert_item(root_leaf, path.slots[0], &item_key, &item_bytes)?;
        fs_info.mark_dirty(root_leaf);
        path.release();

        debug_assert_eq!(
            fs_info.root_bytenr(tree_id),
            Some(leaf_bytenr),
            "create_empty_tree: roots map not updated",
        );

        Ok(leaf_bytenr)
    }

    /// Replace the chunk tree with a fresh empty one.
    ///
    /// This is the apply primitive for `rescue chunk-recover`. It:
    ///
    /// 1. Rebuilds the superblock's `sys_chunk_array` from the provided
    ///    SYSTEM chunk records. This ensures that
    ///    `ensure_in_sys_chunk_array` (called during SYSTEM block
    ///    allocation) finds the entries already present and skips the
    ///    chunk tree read that would fail on a damaged filesystem.
    ///    The rebuild is staged in a scratch buffer and installed only
    ///    once every record has been appended, so an overflow leaves
    ///    the superblock's existing array untouched.
    ///
    /// 2. Allocates a fresh SYSTEM block for the new chunk tree root.
    ///
    /// 3. Initializes it as an empty level-0 leaf owned by tree ID 3,
    ///    with proper fsid, `chunk_tree_uuid`, and backref revision.
    ///
    /// After this call, the caller inserts `DEV_ITEM` and `CHUNK_ITEM`
    /// records into tree ID 3 via the normal `search_slot`/`insert_item`
    /// pipeline, then calls `commit()`. The commit automatically updates
    /// `superblock.chunk_root` and `chunk_root_level`.
    ///
    /// `system_chunks` is a list of `(bg_start, chunk_bytes)` pairs
    /// where `chunk_bytes` is the serialized `btrfs_chunk` (from
    /// `chunk_item_bytes`). Only SYSTEM-type chunks should be included.
    ///
    /// Returns the logical address of the new chunk tree root leaf.
    ///
    /// # Errors
    ///
    /// Returns an error if the `sys_chunk_array` overflows (in which
    /// case the superblock is not modified), the allocator fails, or
    /// the root tree is unreadable.
    pub fn rebuild_chunk_tree(
        &mut self,
        fs_info: &mut Filesystem<R>,
        system_chunks: &[(u64, Vec<u8>)],
    ) -> io::Result<u64> {
        // Step 1: rebuild sys_chunk_array from the provided SYSTEM
        // chunks. Build into locals first: a failed append must not
        // leave the live superblock with a zeroed or half-filled
        // array, since that array is what bootstraps chunk mapping.
        let mut staged_array = [0u8; 2048];
        let mut staged_size = 0;
        for (bg_start, chunk_bytes) in system_chunks {
            sys_chunk_array_append(
                &mut staged_array,
                &mut staged_size,
                *bg_start,
                chunk_bytes,
            )
            .map_err(io::Error::other)?;
        }
        fs_info.superblock.sys_chunk_array_size = staged_size;
        fs_info.superblock.sys_chunk_array = staged_array;

        // Step 2: allocate a fresh SYSTEM block for the chunk tree root.
        // ensure_in_sys_chunk_array will find the entry and return early.
        let chunk_tree_id =
            u64::from(btrfs_disk::raw::BTRFS_CHUNK_TREE_OBJECTID);
        let new_logical = self.alloc_tree_block(fs_info, chunk_tree_id, 0)?;

        // Step 3: initialize as an empty leaf. The fsid and
        // chunk_tree_uuid are inherited from the current root tree
        // root block.
        let (fsid, chunk_tree_uuid) = {
            let root_bytenr = fs_info.root_bytenr(1).ok_or_else(|| {
                io::Error::other("rebuild_chunk_tree: root tree has no root")
            })?;
            let eb = fs_info.read_block(root_bytenr)?;
            (eb.fsid(), eb.chunk_tree_uuid())
        };

        let nodesize = fs_info.nodesize;
        let mut new_eb = ExtentBuffer::new_zeroed(nodesize, new_logical);
        new_eb.set_bytenr(new_logical);
        new_eb.set_level(0);
        new_eb.set_nritems(0);
        new_eb.set_generation(self.transid);
        new_eb.set_owner(chunk_tree_id);
        new_eb.set_fsid(&fsid);
        new_eb.set_chunk_tree_uuid(&chunk_tree_uuid);
        // Only the backref revision bits are set in the header flags;
        // every other flag bit stays clear.
        new_eb.set_flags(
            u64::from(btrfs_disk::raw::BTRFS_MIXED_BACKREF_REV)
                << btrfs_disk::raw::BTRFS_BACKREF_REV_SHIFT,
        );

        // Stage the leaf and make it the in-memory chunk tree root.
        fs_info.mark_dirty(&new_eb);
        fs_info.set_root_bytenr(chunk_tree_id, new_logical);

        Ok(new_logical)
    }

    /// Commit the transaction: update root items, flush delayed refs, write
    /// all dirty blocks, update the superblock, and write to all mirrors.
    ///
    /// The commit sequence:
    /// 0. Force-COW the root tree root so the commit always rewrites
    ///    at least one block at the new generation
    /// 1. Convergence loop: flush delayed reference count updates,
    ///    update root items for trees whose root changed, and apply
    ///    free space tree updates until nothing changes
    /// 2. Write all dirty tree blocks to disk
    /// 3. Update superblock (generation, root and chunk root pointers
    ///    and levels)
    /// 4. Update the rotating backup root slot
    /// 5. Write superblock to all mirrors of every open device
    /// 6. Flush each device's write buffers
    /// 7. Clear per-transaction and cache state
    ///
    /// # Errors
    ///
    /// Returns an error if any tree modification, write, or flush
    /// fails, if the convergence loop does not stabilize within its
    /// pass limit, or if the new root tree / chunk tree root block
    /// cannot be read back to determine its level. In the last case no
    /// superblock field has been modified yet.
    ///
    /// # Panics
    ///
    /// Panics, before the superblock is written, if a pre-write
    /// invariant does not hold: superblock generation equals the
    /// transaction id, `superblock.root` matches the in-memory root
    /// tree root, `bytes_used` is at least `6 * nodesize`, and no
    /// delayed refs remain.
    pub fn commit(mut self, fs_info: &mut Filesystem<R>) -> io::Result<()> {
        // Step 0: Force-COW the root tree root so that every commit
        // rewrites at least one block at the new generation. This keeps
        // `superblock.generation` and the root tree root's
        // `header.generation` in lockstep, which is what `btrfs check`
        // (and the kernel mount path) verify. Without this, a no-op
        // commit would either need to be short-circuited or would
        // corrupt the filesystem with "parent transid verify failed".
        // See PLAN.md Finding 3 invariants I1, I2, I7.
        //
        // `cow_block` is idempotent: if the root tree was already COWed
        // earlier in this transaction (its in-memory generation matches
        // and the block is not yet written to disk), it returns the
        // existing buffer unchanged. The new add/drop delayed refs and
        // the new dirty block flow into the convergence loop below.
        let root_tree_id = 1u64;
        if let Some(root_bytenr) = fs_info.root_bytenr(root_tree_id) {
            let eb = fs_info.read_block(root_bytenr)?;
            let new_eb =
                cow_block(&mut self, fs_info, &eb, root_tree_id, None)?;
            if new_eb.logical() != root_bytenr {
                fs_info.set_root_bytenr(root_tree_id, new_eb.logical());
            }
        }

        // Step 1: Convergence loop. Flushing delayed refs modifies the
        // extent tree (COW), which generates new delayed refs. Updating
        // root items modifies the root tree (COW), generating more.
        // Alternate until both are stable.
        let max_passes = 32;
        for pass in 0..max_passes {
            self.flush_delayed_refs(fs_info)?;
            self.update_root_items(fs_info)?;
            // Snapshot roots BEFORE update_free_space_tree so the next
            // pass's update_root_items picks up the FST root change.
            // If we snapshotted after update_FST, the new FST root
            // would already be in the snapshot baseline and would
            // never be written to the on-disk ROOT_ITEM, leaving the
            // old extent items referenced by a stale root pointer and
            // the new ones orphaned.
            fs_info.snapshot_roots();
            let fst_changed = self.update_free_space_tree(fs_info)?;

            // Stable when no pending refs, no changed roots remain
            // (changed_roots since the snapshot we just took, which
            // captures any changes update_free_space_tree made), no
            // FST updates were produced, and no new range deltas were
            // accumulated.
            if self.delayed_refs.is_empty()
                && fs_info.changed_roots().is_empty()
                && self.bg_range_deltas.is_empty()
                && !fst_changed
            {
                break;
            }

            if pass == max_passes - 1 {
                return Err(io::Error::other(
                    "commit convergence loop did not stabilize",
                ));
            }
        }

        // Step 2: Flush all dirty blocks to disk
        fs_info.flush_dirty()?;

        // Step 3: Update superblock fields.
        //
        // Resolve the new root levels first. A block that cannot be
        // read back must fail the commit: silently keeping the old
        // level would pair a new root pointer with a stale level in
        // the superblock. Doing the fallible reads before touching
        // any superblock field also keeps the in-memory superblock
        // unmodified on that error path.
        let root_update = match fs_info.root_bytenr(1) {
            Some(bytenr) => {
                Some((bytenr, fs_info.read_block(bytenr)?.level()))
            }
            None => None,
        };
        // The chunk root is only rewritten when it actually moved.
        let chunk_update = match fs_info.root_bytenr(3) {
            Some(bytenr) if bytenr != fs_info.superblock.chunk_root => {
                Some((bytenr, fs_info.read_block(bytenr)?.level()))
            }
            _ => None,
        };

        fs_info.superblock.generation = self.transid;

        // The free space tree was updated incrementally inside the
        // convergence loop above. FREE_SPACE_TREE_VALID stays set
        // because the on-disk FST is now consistent with the extent
        // tree.

        // Update root tree root pointer
        if let Some((root_bytenr, root_level)) = root_update {
            fs_info.superblock.root = root_bytenr;
            fs_info.superblock.root_level = root_level;
        }

        // Update chunk tree root pointer (only if it changed)
        if let Some((chunk_bytenr, chunk_level)) = chunk_update {
            fs_info.superblock.chunk_root = chunk_bytenr;
            fs_info.superblock.chunk_root_generation = self.transid;
            fs_info.superblock.chunk_root_level = chunk_level;
        }

        // Pre-write superblock invariants. These are hard assertions
        // (not debug_assert) because writing a corrupt superblock is
        // unrecoverable.
        assert_eq!(
            fs_info.superblock.generation, self.transid,
            "commit: superblock generation {} != transid {}",
            fs_info.superblock.generation, self.transid,
        );
        assert_eq!(
            fs_info.superblock.root,
            fs_info.root_bytenr(1).unwrap_or(0),
            "commit: superblock.root doesn't match in-memory root tree root",
        );
        // bytes_used must be at least 6 * nodesize (kernel minimum).
        let min_bytes_used = 6 * u64::from(fs_info.nodesize);
        assert!(
            fs_info.superblock.bytes_used >= min_bytes_used,
            "commit: bytes_used {} below kernel minimum {} \
             (6 * nodesize {})",
            fs_info.superblock.bytes_used,
            min_bytes_used,
            fs_info.nodesize,
        );
        // All delayed refs must have been flushed.
        assert!(
            self.delayed_refs.is_empty(),
            "commit: {} delayed refs still pending at superblock write",
            self.delayed_refs.len(),
        );

        // Step 4: Update backup roots (rotating through 4 slots)
        let backup_idx = (self.transid % 4) as usize;
        update_backup_root(fs_info, backup_idx);

        // Step 5: Write superblock to all mirrors of every open device.
        //
        // For multi-device filesystems each device has its own
        // `dev_item.devid` / `dev_item.dev_uuid` embedded in the
        // superblock; preserving those across writes is handled by
        // `Filesystem::write_superblock_all_devices`, which splices
        // the per-device dev_item into the in-memory superblock
        // before serializing.
        fs_info.write_superblock_all_devices()?;

        // Step 6: Flush writes to stable storage. `Write::flush()`
        // flushes any userspace buffers. For file-backed storage, the
        // caller should also call `sync()` on the Filesystem (which
        // calls `File::sync_all()` per device) for full durability.
        for dev in fs_info.reader_mut().devices_mut().values_mut() {
            dev.flush()?;
        }

        // Step 7: Clean up
        self.bg_range_deltas.clear();
        fs_info.clear_dirty();
        fs_info.clear_cache();

        Ok(())
    }

    /// Update `ROOT_ITEM` entries in the root tree for every tree whose root
    /// block changed during this transaction.
    ///
    /// For each tree reported by `fs_info.changed_roots()`, searches
    /// the root tree (with COW) for `(tree_id, ROOT_ITEM, 0)`, parses
    /// the existing item, sets its bytenr, level, `generation` and
    /// `generation_v2` (both to this transaction's id), re-serializes
    /// it, and writes it back. When the serialized size is unchanged
    /// the item is overwritten in place; otherwise the old item is
    /// deleted and the new one inserted.
    ///
    /// Trees with no `ROOT_ITEM` in the root tree are skipped.
    ///
    /// # Errors
    ///
    /// Returns an error if a root-tree search or item operation fails,
    /// or if an existing `ROOT_ITEM` cannot be parsed. The latter used
    /// to be skipped silently, which would have left the on-disk item
    /// pointing at the tree's previous root block.
    fn update_root_items(
        &mut self,
        fs_info: &mut Filesystem<R>,
    ) -> io::Result<()> {
        let changed = fs_info.changed_roots();
        if changed.is_empty() {
            return Ok(());
        }

        // Root tree ID = 1
        let root_tree_id = 1u64;

        for (tree_id, new_bytenr, new_level) in changed {
            let key = DiskKey {
                objectid: tree_id,
                key_type: KeyType::RootItem,
                offset: 0,
            };

            let mut path = BtrfsPath::new();
            let found = search::search_slot(
                Some(&mut *self),
                fs_info,
                root_tree_id,
                &key,
                &mut path,
                SearchIntent::ReadOnly,
                true, // COW the path so we can modify the leaf
            )?;

            if !found {
                // No existing ROOT_ITEM for this tree. This shouldn't normally
                // happen for trees that already existed, but skip gracefully.
                path.release();
                continue;
            }

            let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                io::Error::other("update_root_items: no leaf in path")
            })?;
            let slot = path.slots[0];
            // Copy the payload out so the leaf can be mutated below.
            let item_data = leaf.item_data(slot).to_vec();

            // A ROOT_ITEM that exists but cannot be decoded must fail
            // the commit: skipping it would persist a root pointer to
            // the block this transaction just COWed away.
            let Some(mut root_item) = RootItem::parse(&item_data) else {
                path.release();
                return Err(io::Error::other(format!(
                    "update_root_items: cannot parse ROOT_ITEM for tree {tree_id}"
                )));
            };
            root_item.bytenr = new_bytenr;
            root_item.generation = self.transid;
            root_item.generation_v2 = self.transid;
            root_item.level = new_level;

            let new_data = root_item.to_bytes();
            if new_data.len() == item_data.len() {
                // Same size: overwrite the payload in place.
                items::update_item(leaf, slot, &new_data)?;
                fs_info.mark_dirty(leaf);
                path.release();
                continue;
            }

            // Size mismatch (v1 vs v2 root item). Delete and reinsert
            // with the correct size to avoid corruption.
            items::del_items(leaf, slot, 1);
            fs_info.mark_dirty(leaf);
            path.release();

            // Search again with insert intent so the target leaf has
            // room for the resized item.
            let mut path = BtrfsPath::new();
            search::search_slot(
                Some(&mut *self),
                fs_info,
                root_tree_id,
                &key,
                &mut path,
                SearchIntent::Insert((ITEM_SIZE + new_data.len()) as u32),
                true,
            )?;
            let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                io::Error::other(
                    "update_root_items: no leaf after reinsert search",
                )
            })?;
            items::insert_item(leaf, path.slots[0], &key, &new_data)?;
            fs_info.mark_dirty(leaf);
            path.release();
        }

        Ok(())
    }

    /// Flush delayed reference count updates to the extent tree.
    ///
    /// Repeatedly drains the delayed ref queue and applies each entry
    /// with a nonzero delta:
    ///
    /// * Metadata, positive delta: create the metadata extent item.
    /// * Metadata, negative delta: delete the metadata extent item.
    /// * Data, positive delta: create the data extent with `delta`
    ///   references.
    /// * Data, negative delta: drop `-delta` references; only when the
    ///   extent's total reference count reaches zero are the extent
    ///   item and the checksums covering it deleted.
    ///
    /// Every extent that is created or fully freed adjusts
    /// `superblock.bytes_used`, the owning block group's `used`
    /// counter, and the per-block-group range deltas consumed later
    /// by the free space tree update. Extents outside every known
    /// block group only adjust `bytes_used`.
    ///
    /// Processing refs modifies the extent tree, which may queue more
    /// delayed refs from COW, so draining repeats until the queue is
    /// empty (at most 32 rounds).
    ///
    /// # Errors
    ///
    /// Returns an error if any extent-tree operation fails, a data ref
    /// carries `num_bytes == 0`, the queue is still non-empty after 32
    /// rounds, or applying the accumulated delta would take
    /// `superblock.bytes_used` outside the `u64` range (previously
    /// this wrapped silently).
    #[allow(clippy::too_many_lines)]
    fn flush_delayed_refs(
        &mut self,
        fs_info: &mut Filesystem<R>,
    ) -> io::Result<()> {
        let skinny = fs_info.superblock.incompat_flags
            & u64::from(
                btrfs_disk::raw::BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA,
            )
            != 0;

        let extent_tree_id = 2u64;
        let nodesize = u64::from(fs_info.nodesize);

        // Load block groups once so we can map bytenr → block group.
        let block_groups = allocation::load_block_groups(fs_info)?;

        // Track per-block-group deltas: key is block group start address.
        let mut bg_deltas: BTreeMap<u64, i64> = BTreeMap::new();
        let mut bytes_used_delta: i64 = 0;

        // Convergence loop: drain and process until stable.
        // Processing refs modifies the extent tree, which COWs blocks and
        // generates more refs. Each iteration processes more refs than it
        // creates, so this converges.
        let max_iterations = 32;
        for iteration in 0..max_iterations {
            let refs = self.delayed_refs.drain();
            if refs.is_empty() {
                break;
            }

            for dref in refs {
                // Apply the ref to the extent tree. The result says
                // which extent's space accounting changed, as
                // `(bytenr, length, allocated)`; `None` means no
                // accounting change (zero delta, or a data extent
                // that still has references left).
                let accounted = match dref.key {
                    DelayedRefKey::Metadata {
                        bytenr,
                        owner_root,
                        level,
                    } => {
                        if dref.delta > 0 {
                            self.create_metadata_extent(
                                fs_info,
                                extent_tree_id,
                                bytenr,
                                level,
                                owner_root,
                                skinny,
                            )?;
                            Some((bytenr, nodesize, true))
                        } else if dref.delta < 0 {
                            self.delete_metadata_extent(
                                fs_info,
                                extent_tree_id,
                                bytenr,
                                level,
                                skinny,
                            )?;
                            Some((bytenr, nodesize, false))
                        } else {
                            None
                        }
                    }
                    DelayedRefKey::Data {
                        bytenr,
                        owner_root,
                        owner_ino,
                        owner_offset,
                    } => {
                        let num_bytes = dref.num_bytes;
                        if num_bytes == 0 {
                            return Err(io::Error::other(
                                "data delayed ref missing num_bytes",
                            ));
                        }
                        if dref.delta > 0 {
                            let count = dref.delta as u32;
                            self.create_data_extent(
                                fs_info,
                                extent_tree_id,
                                bytenr,
                                num_bytes,
                                owner_root,
                                owner_ino,
                                owner_offset,
                                count,
                            )?;
                            Some((bytenr, num_bytes, true))
                        } else if dref.delta < 0 {
                            let refs_to_drop = (-dref.delta) as u32;
                            let new_total_refs = self.drop_data_extent_ref(
                                fs_info,
                                extent_tree_id,
                                bytenr,
                                num_bytes,
                                owner_root,
                                owner_ino,
                                owner_offset,
                                refs_to_drop,
                            )?;
                            if new_total_refs == 0 {
                                // Whole data extent has been freed.
                                self.delete_data_extent_item(
                                    fs_info,
                                    extent_tree_id,
                                    bytenr,
                                    num_bytes,
                                )?;
                                self.delete_csums_in_range(
                                    fs_info, bytenr, num_bytes,
                                )?;
                                Some((bytenr, num_bytes, false))
                            } else {
                                None
                            }
                        } else {
                            None
                        }
                    }
                };

                let Some((bytenr, length, allocated)) = accounted else {
                    continue;
                };

                // Single accounting path shared by all four cases.
                let signed = length as i64;
                if allocated {
                    bytes_used_delta += signed;
                } else {
                    bytes_used_delta -= signed;
                }
                if let Some(bg_start) =
                    find_containing_block_group(&block_groups, bytenr)
                {
                    let range = Range::new(bytenr, length);
                    let bg_delta = bg_deltas.entry(bg_start).or_insert(0);
                    if allocated {
                        *bg_delta += signed;
                        self.bg_range_deltas.record_allocated(bg_start, range);
                    } else {
                        *bg_delta -= signed;
                        self.bg_range_deltas.record_freed(bg_start, range);
                    }
                }
            }

            if iteration == max_iterations - 1 && !self.delayed_refs.is_empty()
            {
                return Err(io::Error::other(
                    "delayed ref flush did not converge after 32 iterations",
                ));
            }
        }

        // Cancel ranges that were both allocated and freed within
        // this transaction. The FST sees neither.
        self.bg_range_deltas.cancel_within_transaction();

        // Update superblock bytes_used. Checked arithmetic: freeing
        // more than is recorded as used must surface as an error
        // rather than wrap around to a huge value.
        if bytes_used_delta != 0 {
            let current = fs_info.superblock.bytes_used;
            fs_info.superblock.bytes_used = current
                .checked_add_signed(bytes_used_delta)
                .ok_or_else(|| {
                    io::Error::other(format!(
                        "flush_delayed_refs: bytes_used {current} out of range \
                         after applying delta {bytes_used_delta}"
                    ))
                })?;
        }

        // Update each affected block group's used field individually
        for (bg_start, delta) in &bg_deltas {
            if *delta != 0 {
                self.update_block_group_used(fs_info, *bg_start, *delta)?;
            }
        }

        Ok(())
    }

    /// Fold the per-block-group range deltas gathered by
    /// `flush_delayed_refs` into the on-disk free space tree (FST).
    ///
    /// Nothing is written, and the pending deltas are discarded, when
    /// the `FREE_SPACE_TREE` compat_ro bit is clear or tree 10 has no
    /// root. Otherwise, for each block group that carries a delta:
    ///
    /// 1. Resolve the block group's length from the loaded block groups.
    /// 2. Read its `FREE_SPACE_INFO`; a set `USING_BITMAPS` flag is
    ///    rejected with an error (bitmap layout is out of scope for v1).
    /// 3. Collect the existing `FREE_SPACE_EXTENT` items into a sorted
    ///    free-range list.
    /// 4. Run [`free_space::apply_delta`] to obtain the new list.
    /// 5. If the list is unchanged, move on. Otherwise delete every
    ///    existing `FREE_SPACE_EXTENT` of the block group, insert the
    ///    new set, and rewrite `FREE_SPACE_INFO.extent_count`.
    ///
    /// The FST edits go through the standard COW search path, so they
    /// produce their own delayed refs and dirty blocks; the commit
    /// convergence loop picks those up on a later pass.
    ///
    /// Returns `true` when at least one block group's FST items were
    /// rewritten.
    fn update_free_space_tree(
        &mut self,
        fs_info: &mut Filesystem<R>,
    ) -> io::Result<bool> {
        use crate::free_space::{Range, apply_delta};
        use btrfs_disk::items::FreeSpaceInfoFlags;

        let fst_id = 10u64;

        // The compat_ro bit is authoritative: an image built with
        // `^free-space-tree` may still carry a stale empty FST leaf
        // (mkfs PLAN B.2), which the kernel ignores because the bit is
        // clear. A missing tree root is treated the same way.
        let fst_flag =
            u64::from(btrfs_disk::raw::BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE);
        let fst_enabled = fs_info.superblock.compat_ro_flags & fst_flag != 0;
        if !fst_enabled || fs_info.root_bytenr(fst_id).is_none() {
            self.bg_range_deltas.clear();
            return Ok(false);
        }

        // Detach the accumulated deltas. Anything the FST COW below
        // generates lands in the (now empty) self.bg_range_deltas on
        // the next flush_delayed_refs pass.
        let pending = std::mem::take(&mut self.bg_range_deltas);
        if pending.is_empty() {
            return Ok(false);
        }

        // One load serves the length lookup of every block group below.
        let block_groups = allocation::load_block_groups(fs_info)?;

        let mut modified = false;

        for (bg_start, delta) in pending.iter() {
            let bg_start = *bg_start;
            let bg_length = block_groups
                .iter()
                .find(|candidate| candidate.start == bg_start)
                .map(|candidate| candidate.length)
                .ok_or_else(|| {
                    io::Error::other(format!(
                        "free space tree update: block group {bg_start} not found"
                    ))
                })?;
            let bg = Range::new(bg_start, bg_length);

            // FREE_SPACE_INFO must exist and must not use bitmaps.
            let info = self
                .read_free_space_info(fs_info, fst_id, bg_start, bg_length)?
                .ok_or_else(|| {
                    io::Error::other(format!(
                        "free space tree update: FREE_SPACE_INFO missing for block group {bg_start}"
                    ))
                })?;
            if info.flags.contains(FreeSpaceInfoFlags::USING_BITMAPS) {
                return Err(io::Error::other(format!(
                    "free space tree block group {bg_start} uses bitmap layout (unsupported in v1)"
                )));
            }

            // Current on-disk free ranges for this block group.
            let existing = self.read_free_space_extents(
                fs_info, fst_id, bg_start, bg_length,
            )?;

            // Ranges after this transaction's allocations and frees.
            let updated = apply_delta(bg_start, bg, &existing, delta)
                .map_err(|e| io::Error::other(e.to_string()))?;

            if updated == existing {
                continue;
            }

            // Replace the whole extent set: drop the old items first...
            self.delete_free_space_extents_in_range(
                fs_info, fst_id, bg_start, bg_length,
            )?;

            // ...then write one item per surviving free range...
            for free in updated.as_slice() {
                self.insert_free_space_extent(
                    fs_info,
                    fst_id,
                    free.start,
                    free.length,
                )?;
            }

            // ...and record the new item count, keeping the flag word.
            let extent_count =
                u32::try_from(updated.len()).unwrap_or(u32::MAX);
            self.update_free_space_info_count(
                fs_info,
                fst_id,
                bg_start,
                bg_length,
                extent_count,
                info.flags,
            )?;

            modified = true;
        }

        Ok(modified)
    }

    /// Fetch and decode the `FREE_SPACE_INFO` item keyed
    /// `(bg_start, FREE_SPACE_INFO, bg_length)` in tree `fst_id`.
    ///
    /// Returns `Ok(None)` when no item with that exact key exists, and
    /// otherwise whatever `FreeSpaceInfo::parse` makes of the payload.
    pub(crate) fn read_free_space_info(
        &mut self,
        fs_info: &mut Filesystem<R>,
        fst_id: u64,
        bg_start: u64,
        bg_length: u64,
    ) -> io::Result<Option<btrfs_disk::items::FreeSpaceInfo>> {
        use btrfs_disk::items::FreeSpaceInfo;

        let info_key = DiskKey {
            objectid: bg_start,
            key_type: KeyType::FreeSpaceInfo,
            offset: bg_length,
        };
        let mut path = BtrfsPath::new();
        let exists = search::search_slot(
            Some(&mut *self),
            fs_info,
            fst_id,
            &info_key,
            &mut path,
            SearchIntent::ReadOnly,
            false,
        )?;
        if !exists {
            path.release();
            return Ok(None);
        }

        // Copy the payload out so the path can be released before the
        // bytes are parsed.
        let raw = match path.nodes[0].as_ref() {
            Some(leaf) => {
                let slot = path.slots[0];
                leaf.item_data(slot).to_vec()
            }
            None => {
                return Err(io::Error::other(
                    "read_free_space_info: no leaf in path",
                ));
            }
        };
        path.release();
        Ok(FreeSpaceInfo::parse(&raw))
    }

    /// Collect the `FREE_SPACE_EXTENT` items of one block group into a
    /// [`RangeList`].
    ///
    /// The walk starts at `(bg_start, FREE_SPACE_EXTENT, 0)` and stops
    /// at the first key whose objectid is `>= bg_start + bg_length` or
    /// when the tree runs out of leaves. Items of any other key type,
    /// and extents with a zero length, are skipped.
    fn read_free_space_extents(
        &mut self,
        fs_info: &mut Filesystem<R>,
        fst_id: u64,
        bg_start: u64,
        bg_length: u64,
    ) -> io::Result<crate::free_space::RangeList> {
        use crate::free_space::{Range, RangeList};

        let bg_end = bg_start + bg_length;

        let first_key = DiskKey {
            objectid: bg_start,
            key_type: KeyType::FreeSpaceExtent,
            offset: 0,
        };
        let mut path = BtrfsPath::new();
        search::search_slot(
            Some(&mut *self),
            fs_info,
            fst_id,
            &first_key,
            &mut path,
            SearchIntent::ReadOnly,
            false,
        )?;

        // Keys are (start, FREE_SPACE_EXTENT, length), so the walk
        // yields ranges in ascending order. Feeding them through
        // `insert` is a no-op merge on a well-formed tree and harmless
        // if the on-disk state somehow holds touching ranges.
        let mut ranges = RangeList::new();
        loop {
            let Some(leaf) = path.nodes[0].as_ref() else {
                break;
            };
            let slot = path.slots[0];
            if slot >= leaf.nritems() as usize {
                // Past the last item of this leaf: hop to the next one.
                if search::next_leaf(fs_info, &mut path)? {
                    continue;
                }
                break;
            }

            let item = leaf.item_key(slot);
            if item.objectid >= bg_end {
                break;
            }
            let is_extent = item.key_type == KeyType::FreeSpaceExtent;
            if is_extent && item.offset > 0 {
                ranges.insert(Range::new(item.objectid, item.offset));
            }
            path.slots[0] = slot + 1;
        }

        path.release();
        Ok(ranges)
    }

    /// Delete every `FREE_SPACE_EXTENT` item whose objectid lies within
    /// `[bg_start, bg_start + bg_length)`.
    fn delete_free_space_extents_in_range(
        &mut self,
        fs_info: &mut Filesystem<R>,
        fst_id: u64,
        bg_start: u64,
        bg_length: u64,
    ) -> io::Result<()> {
        let bg_end = bg_start + bg_length;
        loop {
            let key = DiskKey {
                objectid: bg_start,
                key_type: KeyType::FreeSpaceExtent,
                offset: 0,
            };
            let mut path = BtrfsPath::new();
            search::search_slot(
                Some(&mut *self),
                fs_info,
                fst_id,
                &key,
                &mut path,
                SearchIntent::Delete,
                true,
            )?;

            let Some(leaf) = path.nodes[0].as_mut() else {
                path.release();
                break;
            };
            let slot = path.slots[0];
            if slot >= leaf.nritems() as usize {
                path.release();
                break;
            }
            let k = leaf.item_key(slot);
            if k.key_type != KeyType::FreeSpaceExtent || k.objectid >= bg_end {
                path.release();
                break;
            }
            items::del_items(leaf, slot, 1);
            fs_info.mark_dirty(leaf);
            path.release();
        }
        Ok(())
    }

    /// Add one payload-less `FREE_SPACE_EXTENT` item keyed
    /// `(start, FREE_SPACE_EXTENT, length)` to tree `fst_id`.
    ///
    /// If an item with that exact key is already present the tree is
    /// left as it is and `Ok(())` is returned.
    fn insert_free_space_extent(
        &mut self,
        fs_info: &mut Filesystem<R>,
        fst_id: u64,
        start: u64,
        length: u64,
    ) -> io::Result<()> {
        let extent_key = DiskKey {
            objectid: start,
            key_type: KeyType::FreeSpaceExtent,
            offset: length,
        };
        let mut path = BtrfsPath::new();
        // The item has no data, so only the item header needs room.
        let already_present = search::search_slot(
            Some(&mut *self),
            fs_info,
            fst_id,
            &extent_key,
            &mut path,
            SearchIntent::Insert(ITEM_SIZE as u32),
            true,
        )?;

        if !already_present {
            let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                io::Error::other("insert_free_space_extent: no leaf in path")
            })?;
            let slot = path.slots[0];
            items::insert_item(leaf, slot, &extent_key, &[])?;
            fs_info.mark_dirty(leaf);
        }

        path.release();
        Ok(())
    }

    /// Rewrite the payload of the `FREE_SPACE_INFO` item keyed
    /// `(bg_start, FREE_SPACE_INFO, bg_length)` as `new_count` followed
    /// by the caller-supplied `flags`, both little-endian.
    ///
    /// # Errors
    ///
    /// Fails when the item does not exist, when the search yields no
    /// leaf, or when the item update itself fails.
    fn update_free_space_info_count(
        &mut self,
        fs_info: &mut Filesystem<R>,
        fst_id: u64,
        bg_start: u64,
        bg_length: u64,
        new_count: u32,
        flags: btrfs_disk::items::FreeSpaceInfoFlags,
    ) -> io::Result<()> {
        // Serialized item body: extent_count, then the flag word.
        let mut payload = new_count.to_le_bytes().to_vec();
        payload.extend_from_slice(&flags.bits().to_le_bytes());

        let info_key = DiskKey {
            objectid: bg_start,
            key_type: KeyType::FreeSpaceInfo,
            offset: bg_length,
        };
        let mut path = BtrfsPath::new();
        let exists = search::search_slot(
            Some(&mut *self),
            fs_info,
            fst_id,
            &info_key,
            &mut path,
            SearchIntent::ReadOnly,
            true,
        )?;
        if !exists {
            path.release();
            return Err(io::Error::other(format!(
                "update_free_space_info_count: FREE_SPACE_INFO missing for {bg_start}"
            )));
        }

        let leaf = path.nodes[0].as_mut().ok_or_else(|| {
            io::Error::other("update_free_space_info_count: no leaf in path")
        })?;
        let slot = path.slots[0];
        items::update_item(leaf, slot, &payload)?;
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Update a specific block group item's `used` field.
    ///
    /// `bg_start` is the logical start address of the block group (the
    /// key's objectid). `bytes_delta` is added to the current `used`
    /// value; the result saturates at `0` and at `u64::MAX` instead of
    /// wrapping.
    ///
    /// Block group keys are `(start, BLOCK_GROUP_ITEM, length)` and the
    /// length is not known here, so the item is located in two steps:
    ///
    /// 1. A read-only probe from `(bg_start, BLOCK_GROUP_ITEM, 0)` finds
    ///    the first key at or after the probe key. If the probe lands
    ///    past the last slot of a leaf, the walk continues into the
    ///    next leaf rather than giving up, so an item that happens to
    ///    start a leaf is still found.
    /// 2. The exact key found in step 1 is searched again with the
    ///    modifying flag set and the item is rewritten in place.
    ///
    /// Returns `Ok(())` without touching anything when no block group
    /// item with objectid `bg_start` exists or when its payload does
    /// not parse.
    fn update_block_group_used(
        &mut self,
        fs_info: &mut Filesystem<R>,
        bg_start: u64,
        bytes_delta: i64,
    ) -> io::Result<()> {
        use btrfs_disk::items::BlockGroupItem;

        // Block group items live in tree 11 (block group tree) or
        // tree 2 (extent tree). The routing may also be temporarily
        // pinned to tree 2 by the convert-to-block-group-tree path
        // while it builds the BGT, hence the accessor.
        let bg_tree_id = fs_info.block_group_tree_id();

        // Step 1: discover the full key (including the length).
        let probe_key = DiskKey {
            objectid: bg_start,
            key_type: KeyType::BlockGroupItem,
            offset: 0,
        };
        let mut path = BtrfsPath::new();
        search::search_slot(
            Some(&mut *self),
            fs_info,
            bg_tree_id,
            &probe_key,
            &mut path,
            SearchIntent::ReadOnly,
            false,
        )?;
        let exact_key = loop {
            let Some(leaf) = path.nodes[0].as_ref() else {
                break None;
            };
            let slot = path.slots[0];
            if slot >= leaf.nritems() as usize {
                // The probe key sorts after every item in this leaf;
                // the block group item may be the first of the next.
                if !search::next_leaf(fs_info, &mut path)? {
                    break None;
                }
                continue;
            }
            let k = leaf.item_key(slot);
            let matches = k.key_type == KeyType::BlockGroupItem
                && k.objectid == bg_start;
            break matches.then_some(k);
        };
        path.release();
        let Some(exact_key) = exact_key else {
            return Ok(());
        };

        // Step 2: re-search the exact key for modification.
        let mut path = BtrfsPath::new();
        let found = search::search_slot(
            Some(&mut *self),
            fs_info,
            bg_tree_id,
            &exact_key,
            &mut path,
            SearchIntent::ReadOnly,
            true,
        )?;
        if !found {
            path.release();
            return Ok(());
        }
        let Some(leaf) = path.nodes[0].as_mut() else {
            path.release();
            return Ok(());
        };
        let slot = path.slots[0];

        // Read, update, and write back the block group item.
        let data = leaf.item_data(slot).to_vec();
        if let Some(bg) = BlockGroupItem::parse(&data) {
            let new_used = bg.used.saturating_add_signed(bytes_delta);
            let new_data = BlockGroupItem {
                used: new_used,
                chunk_objectid: bg.chunk_objectid,
                flags: bg.flags,
            }
            .to_bytes();
            items::update_item(leaf, slot, &new_data)?;
            fs_info.mark_dirty(leaf);
        }

        path.release();
        Ok(())
    }

    /// Insert the extent-tree record for a newly allocated tree block.
    ///
    /// With `skinny` set the record is a `METADATA_ITEM` keyed
    /// `(bytenr, METADATA_ITEM, level)`; otherwise it is an
    /// `EXTENT_ITEM` keyed `(bytenr, EXTENT_ITEM, nodesize)` whose
    /// payload also carries the block's first key and level.
    ///
    /// If an item with that key already exists nothing is written and
    /// `Ok(())` is returned.
    fn create_metadata_extent(
        &mut self,
        fs_info: &mut Filesystem<R>,
        extent_tree_id: u64,
        bytenr: u64,
        level: u8,
        owner: u64,
        skinny: bool,
    ) -> io::Result<()> {
        let (key_type, key_offset) = if skinny {
            (KeyType::MetadataItem, u64::from(level))
        } else {
            (KeyType::ExtentItem, u64::from(fs_info.nodesize))
        };
        let key = DiskKey {
            objectid: bytenr,
            key_type,
            offset: key_offset,
        };

        let data = if skinny {
            ExtentItem::to_bytes_skinny(1, self.transid, owner)
        } else {
            // The non-skinny layout embeds a tree_block_info holding the
            // first key and level of the referenced block. Use an
            // all-zero key when the block cannot be read or is empty.
            let first_key = match fs_info.read_block(bytenr) {
                Ok(eb) if eb.level() == 0 && eb.nritems() > 0 => {
                    eb.item_key(0)
                }
                Ok(eb) if eb.level() > 0 && eb.nritems() > 0 => {
                    eb.key_ptr_key(0)
                }
                _ => DiskKey {
                    objectid: 0,
                    key_type: KeyType::Unknown(0),
                    offset: 0,
                },
            };
            ExtentItem::to_bytes_non_skinny(
                1,
                self.transid,
                owner,
                &first_key,
                level,
            )
        };

        let mut path = BtrfsPath::new();
        let already_present = search::search_slot(
            Some(&mut *self),
            fs_info,
            extent_tree_id,
            &key,
            &mut path,
            SearchIntent::Insert((ITEM_SIZE + data.len()) as u32),
            true,
        )?;

        if !already_present {
            let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                io::Error::other("create_metadata_extent: no leaf in path")
            })?;
            let slot = path.slots[0];
            items::insert_item(leaf, slot, &key, &data)?;
            fs_info.mark_dirty(leaf);
        }
        // An existing item is unexpected for a fresh allocation; it is
        // left exactly as found (no refcount change is made here).

        path.release();
        Ok(())
    }

    /// Insert the `EXTENT_ITEM` keyed `(bytenr, EXTENT_ITEM, num_bytes)`
    /// for a newly allocated data extent. The payload is built by
    /// `ExtentItem::to_bytes_data` from `count`, the current transid and
    /// the `(owner_root, owner_ino, owner_offset)` backref triple.
    ///
    /// If the key already exists, debug builds trip an assertion;
    /// release builds leave the existing item alone and return `Ok(())`.
    #[allow(clippy::too_many_arguments)]
    fn create_data_extent(
        &mut self,
        fs_info: &mut Filesystem<R>,
        extent_tree_id: u64,
        bytenr: u64,
        num_bytes: u64,
        owner_root: u64,
        owner_ino: u64,
        owner_offset: u64,
        count: u32,
    ) -> io::Result<()> {
        let data = ExtentItem::to_bytes_data(
            u64::from(count),
            self.transid,
            owner_root,
            owner_ino,
            owner_offset,
            count,
        );
        debug_assert_eq!(data.len(), ExtentItem::DATA_INLINE_SIZE);

        let key = DiskKey {
            objectid: bytenr,
            key_type: KeyType::ExtentItem,
            offset: num_bytes,
        };
        let mut path = BtrfsPath::new();
        let duplicate = search::search_slot(
            Some(&mut *self),
            fs_info,
            extent_tree_id,
            &key,
            &mut path,
            SearchIntent::Insert((ITEM_SIZE + data.len()) as u32),
            true,
        )?;

        if duplicate {
            // For v1 (mkfs) every data extent has exactly one backref,
            // so hitting an existing item indicates a caller bug.
            debug_assert!(
                false,
                "create_data_extent: extent item already exists at {bytenr}"
            );
        } else {
            let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                io::Error::other("create_data_extent: no leaf in path")
            })?;
            let slot = path.slots[0];
            items::insert_item(leaf, slot, &key, &data)?;
            fs_info.mark_dirty(leaf);
        }

        path.release();
        Ok(())
    }

    /// Remove the extent-tree record of a freed tree block.
    ///
    /// The key is `(bytenr, METADATA_ITEM, level)` when `skinny` is set
    /// and `(bytenr, EXTENT_ITEM, nodesize)` otherwise. A missing item
    /// is not an error: the call returns `Ok(())` without changes.
    fn delete_metadata_extent(
        &mut self,
        fs_info: &mut Filesystem<R>,
        extent_tree_id: u64,
        bytenr: u64,
        level: u8,
        skinny: bool,
    ) -> io::Result<()> {
        let (key_type, key_offset) = if skinny {
            (KeyType::MetadataItem, u64::from(level))
        } else {
            (KeyType::ExtentItem, u64::from(fs_info.nodesize))
        };
        let key = DiskKey {
            objectid: bytenr,
            key_type,
            offset: key_offset,
        };

        let mut path = BtrfsPath::new();
        let present = search::search_slot(
            Some(&mut *self),
            fs_info,
            extent_tree_id,
            &key,
            &mut path,
            SearchIntent::Delete,
            true,
        )?;

        // A block allocated before the transaction crate managed the
        // extent tree may have no extent item at all; nothing to do then.
        if present {
            let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                io::Error::other("delete_metadata_extent: no leaf in path")
            })?;
            let slot = path.slots[0];
            items::del_items(leaf, slot, 1);
            fs_info.mark_dirty(leaf);
        }

        path.release();
        Ok(())
    }

    /// Release `refs_to_drop` references held by the backref
    /// `(target_root, target_ino, target_offset)` on the data extent
    /// `(bytenr, num_bytes)`.
    ///
    /// The backref is looked for inline in the `EXTENT_ITEM` first. If
    /// it is not inline, the item's leading `refs` counter is lowered
    /// directly and the standalone `EXTENT_DATA_REF_KEY` item is updated
    /// through [`Self::drop_standalone_data_ref`].
    ///
    /// Returns the `refs` total left on the `EXTENT_ITEM`. Freeing the
    /// extent itself when that total reaches 0 is the caller's job.
    ///
    /// # Errors
    ///
    /// Fails when the `EXTENT_ITEM` is missing, its payload is shorter
    /// than 24 bytes, the counter would underflow, or any helper fails.
    #[allow(clippy::too_many_arguments)]
    fn drop_data_extent_ref(
        &mut self,
        fs_info: &mut Filesystem<R>,
        extent_tree_id: u64,
        bytenr: u64,
        num_bytes: u64,
        target_root: u64,
        target_ino: u64,
        target_offset: u64,
        refs_to_drop: u32,
    ) -> io::Result<u64> {
        // Data extents always use EXTENT_ITEM_KEY with offset ==
        // num_bytes (never the skinny metadata key).
        let extent_key = DiskKey {
            objectid: bytenr,
            key_type: KeyType::ExtentItem,
            offset: num_bytes,
        };

        let mut path = BtrfsPath::new();
        let located = search::search_slot(
            Some(&mut *self),
            fs_info,
            extent_tree_id,
            &extent_key,
            &mut path,
            SearchIntent::Delete,
            true,
        )?;
        if !located {
            path.release();
            return Err(io::Error::other(format!(
                "drop_data_extent_ref: EXTENT_ITEM not found at bytenr {bytenr} num_bytes {num_bytes}"
            )));
        }

        let leaf = path.nodes[0].as_mut().ok_or_else(|| {
            io::Error::other("drop_data_extent_ref: no leaf in path")
        })?;
        let slot = path.slots[0];

        // Is our backref stored inline in the EXTENT_ITEM?
        let inline_hit = locate_inline_data_ref(
            leaf,
            slot,
            target_root,
            target_ino,
            target_offset,
        )?;

        match inline_hit {
            Some(loc) => {
                let remaining =
                    decrement_inline_data_ref(leaf, slot, &loc, refs_to_drop)?;
                fs_info.mark_dirty(leaf);
                path.release();
                Ok(remaining)
            }
            None => {
                // Not inline. Lower EXTENT_ITEM.refs while the path is
                // still held, then release it and go after the
                // standalone backref item.
                let header = leaf.item_data(slot);
                if header.len() < 24 {
                    return Err(io::Error::other(
                        "drop_data_extent_ref: EXTENT_ITEM payload too short",
                    ));
                }
                let before =
                    u64::from_le_bytes(header[0..8].try_into().unwrap());
                let remaining = before
                    .checked_sub(u64::from(refs_to_drop))
                    .ok_or_else(|| {
                        io::Error::other(
                            "drop_data_extent_ref: EXTENT_ITEM.refs underflow",
                        )
                    })?;
                leaf.item_data_mut(slot)[0..8]
                    .copy_from_slice(&remaining.to_le_bytes());
                fs_info.mark_dirty(leaf);
                path.release();

                self.drop_standalone_data_ref(
                    fs_info,
                    extent_tree_id,
                    bytenr,
                    target_root,
                    target_ino,
                    target_offset,
                    refs_to_drop,
                )?;
                Ok(remaining)
            }
        }
    }

    /// Lower (or delete) the standalone `EXTENT_DATA_REF_KEY` item for
    /// the `(target_root, target_ino, target_offset)` triple on extent
    /// `bytenr`.
    ///
    /// The search starts at the triple's hash and walks forward over
    /// `EXTENT_DATA_REF` items of the same extent until the payload
    /// matches the triple, which steps past hash collisions. The item
    /// is deleted when its count reaches zero and rewritten otherwise.
    ///
    /// # Errors
    ///
    /// Fails when the walk leaves the extent's `EXTENT_DATA_REF` items
    /// or runs out of leaves before a match, when a payload does not
    /// parse, or when `refs_to_drop` exceeds the stored count.
    #[allow(clippy::too_many_arguments)]
    fn drop_standalone_data_ref(
        &mut self,
        fs_info: &mut Filesystem<R>,
        extent_tree_id: u64,
        bytenr: u64,
        target_root: u64,
        target_ino: u64,
        target_offset: u64,
        refs_to_drop: u32,
    ) -> io::Result<()> {
        use btrfs_disk::items::{ExtentDataRef, extent_data_ref_hash};

        let start_key = DiskKey {
            objectid: bytenr,
            key_type: KeyType::ExtentDataRef,
            offset: extent_data_ref_hash(
                target_root,
                target_ino,
                target_offset,
            ),
        };
        let mut path = BtrfsPath::new();
        search::search_slot(
            Some(&mut *self),
            fs_info,
            extent_tree_id,
            &start_key,
            &mut path,
            SearchIntent::Delete,
            true,
        )?;

        loop {
            let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                io::Error::other("drop_standalone_data_ref: no leaf in path")
            })?;
            let slot = path.slots[0];
            if slot >= leaf.nritems() as usize {
                if search::next_leaf(fs_info, &mut path)? {
                    continue;
                }
                path.release();
                return Err(io::Error::other(
                    "drop_standalone_data_ref: ran out of leaves",
                ));
            }

            // Walking off this extent's EXTENT_DATA_REF items means the
            // triple has no standalone backref.
            let found_key = leaf.item_key(slot);
            let same_extent = found_key.objectid == bytenr
                && found_key.key_type == KeyType::ExtentDataRef;
            if !same_extent {
                path.release();
                return Err(io::Error::other(format!(
                    "drop_standalone_data_ref: triple ({target_root},{target_ino},{target_offset}) not found at bytenr {bytenr}"
                )));
            }

            let mut payload = leaf.item_data(slot).to_vec();
            let backref = ExtentDataRef::parse(&payload).ok_or_else(|| {
                io::Error::other(
                    "drop_standalone_data_ref: malformed EXTENT_DATA_REF",
                )
            })?;
            let is_target = backref.root == target_root
                && backref.objectid == target_ino
                && backref.offset == target_offset;
            if !is_target {
                // Hash collision: try the next item.
                path.slots[0] = slot + 1;
                continue;
            }

            let remaining =
                backref.count.checked_sub(refs_to_drop).ok_or_else(|| {
                    io::Error::other(
                        "drop_standalone_data_ref: count underflow",
                    )
                })?;
            if remaining == 0 {
                items::del_items(leaf, slot, 1);
            } else {
                // The count occupies payload bytes 24..28.
                payload[24..28].copy_from_slice(&remaining.to_le_bytes());
                items::update_item(leaf, slot, &payload)?;
            }
            fs_info.mark_dirty(leaf);
            path.release();
            return Ok(());
        }
    }

    /// Delete the `EXTENT_ITEM` keyed `(bytenr, EXTENT_ITEM, num_bytes)`
    /// of a data extent that no longer has any references.
    ///
    /// # Errors
    ///
    /// Fails when no item with that exact key exists or when the search
    /// yields no leaf.
    fn delete_data_extent_item(
        &mut self,
        fs_info: &mut Filesystem<R>,
        extent_tree_id: u64,
        bytenr: u64,
        num_bytes: u64,
    ) -> io::Result<()> {
        let extent_key = DiskKey {
            objectid: bytenr,
            key_type: KeyType::ExtentItem,
            offset: num_bytes,
        };
        let mut path = BtrfsPath::new();
        let present = search::search_slot(
            Some(&mut *self),
            fs_info,
            extent_tree_id,
            &extent_key,
            &mut path,
            SearchIntent::Delete,
            true,
        )?;
        if !present {
            path.release();
            return Err(io::Error::other(format!(
                "delete_data_extent_item: EXTENT_ITEM missing at {bytenr}"
            )));
        }

        let leaf = match path.nodes[0].as_mut() {
            Some(leaf) => leaf,
            None => {
                return Err(io::Error::other(
                    "delete_data_extent_item: no leaf in path",
                ));
            }
        };
        let slot = path.slots[0];
        items::del_items(leaf, slot, 1);
        fs_info.mark_dirty(leaf);
        path.release();
        Ok(())
    }

    /// Remove csum coverage for `[bytenr, bytenr + num_bytes)` from the
    /// csum tree.
    ///
    /// Csum items pack one or more sector csums into a single item
    /// keyed by the logical start offset. A freed data extent may
    /// occupy any contiguous span of sectors inside such an item, so
    /// this helper supports three cases per overlapping csum item:
    ///
    /// - Entirely contained in the freed range → delete the item.
    /// - Freed range strictly inside the item → split into a leading
    ///   and trailing csum item.
    /// - One side trimmed → delete and re-insert one shorter item.
    #[allow(clippy::too_many_lines, clippy::items_after_statements)]
    fn delete_csums_in_range(
        &mut self,
        fs_info: &mut Filesystem<R>,
        bytenr: u64,
        num_bytes: u64,
    ) -> io::Result<()> {
        let csum_tree_id = 7u64;
        if fs_info.root_bytenr(csum_tree_id).is_none() {
            return Ok(());
        }

        // What survives a partial overlap.
        struct Surviving {
            key: DiskKey,
            payload: Vec<u8>,
        }
        // What we plan to do to one csum item.
        struct CsumOp {
            old_key: DiskKey,
            // Up to two surviving sub-items (head and/or tail). Empty
            // means whole-item deletion.
            survivors: Vec<Surviving>,
        }

        // BTRFS_EXTENT_CSUM_OBJECTID == -10 in i64 ==
        // 0xFFFF_FFFF_FFFF_FFF6. The constant binds as i32 in raw, so
        // sign-extend through i64.
        let csum_objectid =
            i64::from(btrfs_disk::raw::BTRFS_EXTENT_CSUM_OBJECTID) as u64;
        let sectorsize = u64::from(fs_info.superblock.sectorsize);
        // v1 only supports CRC32C filesystems (4-byte csums). Other csum
        // types are not produced by mkfs in this codebase.
        let csum_size: u64 = 4;
        let end = bytenr + num_bytes;

        // Pass 1: walk the csum tree once and collect every operation
        // (full delete or trim/split). Done as a read-only walk so we
        // never hold an &mut borrow on `path` across calls.
        let mut ops: Vec<CsumOp> = Vec::new();
        {
            // Start at the largest key whose offset <= bytenr.
            let start_key = DiskKey {
                objectid: csum_objectid,
                key_type: KeyType::ExtentCsum,
                offset: bytenr,
            };
            let mut path = BtrfsPath::new();
            let found = search::search_slot(
                Some(&mut *self),
                fs_info,
                csum_tree_id,
                &start_key,
                &mut path,
                SearchIntent::ReadOnly,
                false,
            )?;
            if !found && path.slots[0] > 0 {
                path.slots[0] -= 1;
            }

            'walk: while let Some(leaf) = path.nodes[0].as_ref() {
                let nritems = leaf.nritems() as usize;
                if path.slots[0] >= nritems {
                    if !search::next_leaf(fs_info, &mut path)? {
                        break;
                    }
                    continue;
                }
                let slot = path.slots[0];
                let item_key = leaf.item_key(slot);
                if item_key.objectid != csum_objectid
                    || item_key.key_type != KeyType::ExtentCsum
                {
                    // For the very first iteration we may have backed
                    // up onto a non-csum item; advance once and retry
                    // the type check before bailing.
                    if ops.is_empty() {
                        path.slots[0] = slot + 1;
                        continue;
                    }
                    break 'walk;
                }
                let item_size = u64::from(leaf.item_size(slot));
                let csum_start = item_key.offset;
                let sectors = item_size / csum_size;
                let csum_end = csum_start + sectors * sectorsize;

                if csum_end <= bytenr {
                    path.slots[0] = slot + 1;
                    continue;
                }
                if csum_start >= end {
                    break 'walk;
                }

                // Compute up-to-two surviving sub-items: head before
                // bytenr, tail after end. Sectors fully inside the
                // freed range are dropped. The freed range and csum
                // item are both sectorsize-aligned by construction.
                let payload = leaf.item_data(slot).to_vec();
                let mut survivors: Vec<Surviving> = Vec::new();

                if csum_start < bytenr {
                    let head_sectors =
                        ((bytenr - csum_start) / sectorsize) as usize;
                    let head_bytes = head_sectors * csum_size as usize;
                    survivors.push(Surviving {
                        key: DiskKey {
                            objectid: csum_objectid,
                            key_type: KeyType::ExtentCsum,
                            offset: csum_start,
                        },
                        payload: payload[..head_bytes].to_vec(),
                    });
                }
                if csum_end > end {
                    let skipped_sectors =
                        ((end - csum_start) / sectorsize) as usize;
                    let tail_start_bytes = skipped_sectors * csum_size as usize;
                    let tail_byte_count = (sectors as usize - skipped_sectors)
                        * csum_size as usize;
                    survivors.push(Surviving {
                        key: DiskKey {
                            objectid: csum_objectid,
                            key_type: KeyType::ExtentCsum,
                            offset: end,
                        },
                        payload: payload[tail_start_bytes
                            ..tail_start_bytes + tail_byte_count]
                            .to_vec(),
                    });
                }

                ops.push(CsumOp {
                    old_key: item_key,
                    survivors,
                });
                path.slots[0] = slot + 1;
            }
            path.release();
        }

        // Pass 2: apply each collected op. Re-search per item because
        // earlier mutations may have COWed leaves and shifted slots.
        for op in ops {
            // Delete the original item.
            let mut path = BtrfsPath::new();
            let found = search::search_slot(
                Some(&mut *self),
                fs_info,
                csum_tree_id,
                &op.old_key,
                &mut path,
                SearchIntent::Delete,
                true,
            )?;
            if found {
                let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                    io::Error::other("delete_csums_in_range: no leaf in path")
                })?;
                items::del_items(leaf, path.slots[0], 1);
                fs_info.mark_dirty(leaf);
            }
            path.release();

            // Insert any surviving fragments.
            for sv in op.survivors {
                if sv.payload.is_empty() {
                    continue;
                }
                let mut path = BtrfsPath::new();
                let found = search::search_slot(
                    Some(&mut *self),
                    fs_info,
                    csum_tree_id,
                    &sv.key,
                    &mut path,
                    SearchIntent::Insert((ITEM_SIZE + sv.payload.len()) as u32),
                    true,
                )?;
                if found {
                    path.release();
                    continue;
                }
                let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                    io::Error::other(
                        "delete_csums_in_range: no leaf for insert",
                    )
                })?;
                items::insert_item(leaf, path.slots[0], &sv.key, &sv.payload)?;
                fs_info.mark_dirty(leaf);
                path.release();
            }
        }
        Ok(())
    }

    /// Register the SYSTEM chunk that holds `logical` in the superblock's
    /// `sys_chunk_array` bootstrap buffer, unless it is already listed.
    ///
    /// The chunk tree translates logical addresses to physical ones, but
    /// its own blocks are addressed logically too. At mount time the
    /// kernel breaks that circular dependency by reading the chunk
    /// records embedded in the superblock's `sys_chunk_array`, so every
    /// system chunk the chunk tree may COW into has to be present there.
    ///
    /// When the system chunk covering `logical` is already part of the
    /// snippet this returns `Ok(())` without touching the superblock.
    ///
    /// # Errors
    ///
    /// Fails when no SYSTEM block group covers `logical`, when the
    /// matching `CHUNK_ITEM` cannot be found or parsed, or when appending
    /// to the `sys_chunk_array` is rejected.
    fn ensure_in_sys_chunk_array(
        &mut self,
        fs_info: &mut Filesystem<R>,
        logical: u64,
    ) -> io::Result<()> {
        // The receiver is not needed here; this binding only silences
        // clippy's unused-`self` lint.
        let _ = self;

        // Step 1: find the SYSTEM block group that covers `logical`.
        let block_groups = allocation::load_block_groups(fs_info)?;
        let Some(owner) = block_groups.iter().find(|g| {
            g.flags.contains(BlockGroupFlags::SYSTEM)
                && logical >= g.start
                && logical < g.start + g.length
        }) else {
            return Err(io::Error::other(format!(
                "ensure_in_sys_chunk_array: no SYSTEM block group contains {logical}"
            )));
        };
        let bg_start = owner.start;

        // Step 2: nothing to do when the snippet already lists the chunk.
        let already_listed = sys_chunk_array_contains(
            &fs_info.superblock.sys_chunk_array,
            fs_info.superblock.sys_chunk_array_size,
            bg_start,
        );
        if already_listed {
            return Ok(());
        }

        // Step 3: fetch the raw CHUNK_ITEM payload from the chunk tree.
        let chunk_tree_id =
            u64::from(btrfs_disk::raw::BTRFS_CHUNK_TREE_OBJECTID);
        let chunk_key = DiskKey {
            objectid: u64::from(
                btrfs_disk::raw::BTRFS_FIRST_CHUNK_TREE_OBJECTID,
            ),
            key_type: KeyType::ChunkItem,
            offset: bg_start,
        };
        let mut path = BtrfsPath::new();
        let exact = search::search_slot(
            None,
            fs_info,
            chunk_tree_id,
            &chunk_key,
            &mut path,
            SearchIntent::ReadOnly,
            false,
        )?;
        if !exact {
            path.release();
            return Err(io::Error::other(format!(
                "ensure_in_sys_chunk_array: CHUNK_ITEM missing for bg {bg_start}"
            )));
        }
        let raw_item = match path.nodes[0].as_ref() {
            Some(leaf) => leaf.item_data(path.slots[0]).to_vec(),
            None => {
                return Err(io::Error::other(
                    "ensure_in_sys_chunk_array: no leaf in path",
                ));
            }
        };
        path.release();

        // Step 4: round-trip through the parsed chunk mapping instead of
        // copying the stored bytes verbatim, so the snippet encoding stays
        // independent of how the chunk tree happens to store the item.
        let Some((mapping, _)) = parse_chunk_item(&raw_item, bg_start) else {
            return Err(io::Error::other(
                "ensure_in_sys_chunk_array: malformed CHUNK_ITEM",
            ));
        };
        let chunk_bytes =
            chunk_item_bytes(&mapping, fs_info.superblock.sectorsize);

        // Step 5: append the record to the bootstrap snippet.
        let new_size = sys_chunk_array_append(
            &mut fs_info.superblock.sys_chunk_array,
            &mut fs_info.superblock.sys_chunk_array_size,
            bg_start,
            &chunk_bytes,
        )
        .map_err(|e| {
            io::Error::other(format!("ensure_in_sys_chunk_array: {e}"))
        })?;
        debug_assert!(new_size > 0);
        Ok(())
    }

    /// Rebuild free space tree entries by scanning the extent tree.
    ///
    /// For every block group this computes the free ranges from the
    /// extent tree, deletes the block group's existing
    /// `FREE_SPACE_EXTENT` items, inserts one item per free range, and
    /// rewrites the `extent_count` field of the block group's
    /// `FREE_SPACE_INFO` item (flags are preserved). A full rewrite is
    /// simpler and more robust than incremental updates because it has
    /// no convergence issues.
    ///
    /// Returns `Ok(())` without doing anything when the filesystem has no
    /// free space tree.
    ///
    /// # Errors
    ///
    /// Propagates I/O and tree-search failures, and reports an error
    /// (rather than panicking) if a search yields no leaf or the number
    /// of free ranges does not fit the on-disk `u32` counter.
    #[allow(dead_code)]
    fn rebuild_free_space_tree(
        &mut self,
        fs_info: &mut Filesystem<R>,
    ) -> io::Result<()> {
        use crate::allocation;
        use btrfs_disk::items::FreeSpaceInfo;

        // Object id of the free space tree.
        let fst_id = 10u64;
        if fs_info.root_bytenr(fst_id).is_none() {
            return Ok(());
        }
        let groups = allocation::load_block_groups(fs_info)?;

        for bg in &groups {
            // Find free ranges within this block group. This must happen
            // before the old items are removed below.
            let free_ranges =
                allocation::find_free_extents(fs_info, bg.start, bg.length, 1)?;

            // The on-disk counter is 32 bits wide; refuse to truncate.
            let extent_count: u32 =
                free_ranges.len().try_into().map_err(|_| {
                    io::Error::other(
                        "rebuild_free_space_tree: extent count exceeds u32",
                    )
                })?;

            // Delete existing FREE_SPACE_EXTENT items for this block group
            self.delete_free_space_extents(
                fs_info, fst_id, bg.start, bg.length,
            )?;

            // Insert new FREE_SPACE_EXTENT items (key-only, empty payload)
            for &(start, len) in &free_ranges {
                let key = DiskKey {
                    objectid: start,
                    key_type: KeyType::FreeSpaceExtent,
                    offset: len,
                };
                let mut path = BtrfsPath::new();
                let found = search::search_slot(
                    Some(&mut *self),
                    fs_info,
                    fst_id,
                    &key,
                    &mut path,
                    SearchIntent::Insert(ITEM_SIZE as u32),
                    true,
                )?;
                if !found {
                    let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                        io::Error::other(
                            "rebuild_free_space_tree: no leaf for insert",
                        )
                    })?;
                    items::insert_item(leaf, path.slots[0], &key, &[])?;
                    fs_info.mark_dirty(leaf);
                }
                path.release();
            }

            // Update FREE_SPACE_INFO for this block group
            let info_key = DiskKey {
                objectid: bg.start,
                key_type: KeyType::FreeSpaceInfo,
                offset: bg.length,
            };
            let mut path = BtrfsPath::new();
            // NOTE(review): the intent is `ReadOnly` but the leaf is
            // modified below; presumably the trailing `true` requests COW
            // so the update lands on a writable copy — confirm.
            let found = search::search_slot(
                Some(&mut *self),
                fs_info,
                fst_id,
                &info_key,
                &mut path,
                SearchIntent::ReadOnly,
                true,
            )?;
            if found {
                let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                    io::Error::other(
                        "rebuild_free_space_tree: no leaf for FREE_SPACE_INFO",
                    )
                })?;
                let slot = path.slots[0];
                let data = leaf.item_data(slot).to_vec();
                if let Some(info) = FreeSpaceInfo::parse(&data) {
                    // Update extent_count, preserve flags
                    let mut new_data = Vec::with_capacity(8);
                    new_data.extend_from_slice(&extent_count.to_le_bytes());
                    new_data
                        .extend_from_slice(&info.flags.bits().to_le_bytes());
                    items::update_item(leaf, slot, &new_data)?;
                    fs_info.mark_dirty(leaf);
                }
            }
            path.release();
        }

        Ok(())
    }

    /// Delete all `FREE_SPACE_EXTENT` items within a block group's range.
    ///
    /// Repeatedly searches the tree `fst_id` for the smallest key at or
    /// after `(bg_start, FREE_SPACE_EXTENT, 0)` and deletes the item found
    /// there, one item per search, until the slot is past the end of the
    /// leaf, the item is not a `FREE_SPACE_EXTENT`, or its objectid is at
    /// or beyond `bg_start + bg_length`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by the tree search.
    #[allow(dead_code)]
    fn delete_free_space_extents(
        &mut self,
        fs_info: &mut Filesystem<R>,
        fst_id: u64,
        bg_start: u64,
        bg_length: u64,
    ) -> io::Result<()> {
        // Exclusive upper bound of the block group's logical range.
        let bg_end = bg_start + bg_length;

        // Search for the first key >= bg_start with type FREE_SPACE_EXTENT
        let search_key = DiskKey {
            objectid: bg_start,
            key_type: KeyType::FreeSpaceExtent,
            offset: 0,
        };

        loop {
            // A fresh path per iteration: the previous deletion may have
            // changed the leaf, so the position is looked up again.
            let mut path = BtrfsPath::new();
            let _found = search::search_slot(
                Some(&mut *self),
                fs_info,
                fst_id,
                &search_key,
                &mut path,
                SearchIntent::Delete,
                true,
            )?;

            let Some(leaf) = path.nodes[0].as_mut() else {
                break;
            };
            let slot = path.slots[0];
            // NOTE(review): a slot at the end of the leaf ends the loop
            // without looking at the following leaf; if matching items can
            // continue there they would be left behind — confirm whether
            // `search_slot` with `Delete` intent rules this out.
            if slot >= leaf.nritems() as usize {
                path.release();
                break;
            }

            // Stop at the first item that is not a free-space extent of
            // this block group.
            let key = leaf.item_key(slot);
            if key.key_type != KeyType::FreeSpaceExtent
                || key.objectid >= bg_end
            {
                path.release();
                break;
            }

            items::del_items(leaf, slot, 1);
            fs_info.mark_dirty(leaf);
            path.release();
            // Loop to find and delete the next one
        }

        Ok(())
    }

    /// Update the free space tree to account for specific allocated blocks.
    ///
    /// Every address in `allocated` is treated as the start of one
    /// `nodesize`-long block. For each, the `FREE_SPACE_EXTENT` item that
    /// fully contains the block is deleted and replaced by up to two
    /// items covering the free space left before and after the block.
    /// Addresses not fully contained in a free-space extent are skipped.
    ///
    /// The `FREE_SPACE_INFO` extent count is intentionally not adjusted
    /// here (see the note at the end of the loop body).
    ///
    /// Returns `Ok(())` without doing anything when the filesystem has no
    /// free space tree.
    ///
    /// # Errors
    ///
    /// Propagates tree-search and item-insert failures, and reports an
    /// error (rather than panicking) if an insert search yields no leaf.
    #[allow(dead_code)]
    fn update_free_space_tree_for(
        &mut self,
        fs_info: &mut Filesystem<R>,
        allocated: &[u64],
    ) -> io::Result<()> {
        // Object id of the free space tree.
        let fst_id = 10u64;
        if fs_info.root_bytenr(fst_id).is_none() {
            return Ok(()); // No free space tree
        }

        let nodesize = u64::from(fs_info.nodesize);

        for &addr in allocated {
            // Search for a FREE_SPACE_EXTENT containing this address.
            // Key: (start, FREE_SPACE_EXTENT=199, length)
            // We search for the largest key <= addr with type 199.
            let search_key = DiskKey {
                objectid: addr,
                key_type: KeyType::FreeSpaceExtent,
                offset: u64::MAX,
            };

            let mut path = BtrfsPath::new();
            let found = search::search_slot(
                Some(&mut *self),
                fs_info,
                fst_id,
                &search_key,
                &mut path,
                SearchIntent::Delete,
                true,
            )?;

            // If not exact match, back up one slot
            if !found && path.slots[0] > 0 {
                path.slots[0] -= 1;
            }

            let Some(leaf) = path.nodes[0].as_mut() else {
                path.release();
                continue;
            };
            let slot = path.slots[0];
            if slot >= leaf.nritems() as usize {
                path.release();
                continue;
            }

            let item_key = leaf.item_key(slot);
            if item_key.key_type != KeyType::FreeSpaceExtent {
                path.release();
                continue;
            }

            let extent_start = item_key.objectid;
            let extent_len = item_key.offset;
            let extent_end = extent_start + extent_len;

            // Check if this free extent contains our allocation
            let after = addr + nodesize;
            if addr < extent_start || after > extent_end {
                path.release();
                continue;
            }

            // Delete the old free space extent
            items::del_items(leaf, slot, 1);
            fs_info.mark_dirty(leaf);
            path.release();

            // Insert replacement extent(s): the free space left of the
            // block and the free space right of it. A zero-length side
            // means the block touched that edge and nothing remains.
            let remainders = [
                (extent_start, addr - extent_start),
                (after, extent_end - after),
            ];
            for (start, len) in remainders {
                if len == 0 {
                    continue;
                }
                let key = DiskKey {
                    objectid: start,
                    key_type: KeyType::FreeSpaceExtent,
                    offset: len,
                };
                let mut path = BtrfsPath::new();
                let exists = search::search_slot(
                    Some(&mut *self),
                    fs_info,
                    fst_id,
                    &key,
                    &mut path,
                    SearchIntent::Insert(ITEM_SIZE as u32),
                    true,
                )?;
                // Never insert a duplicate key; an item that is already
                // present describes the same range.
                if !exists {
                    let leaf = path.nodes[0].as_mut().ok_or_else(|| {
                        io::Error::other(
                            "update_free_space_tree_for: no leaf for insert",
                        )
                    })?;
                    items::insert_item(leaf, path.slots[0], &key, &[])?;
                    fs_info.mark_dirty(leaf);
                }
                path.release();
            }

            // Update FREE_SPACE_INFO extent_count for this block group.
            // For a simple allocation from the middle of an extent:
            // count changes by +1 (one extent becomes two) or -1 (exact match
            // removes one) or 0 (trim from edge). Skip for now — the kernel
            // rebuilds this on mount when VALID is cleared.
        }

        Ok(())
    }

    /// Abort the transaction: discard all dirty blocks without writing.
    ///
    /// Consumes the transaction. The in-memory generation is reset to
    /// the value recorded in the superblock, the dirty set and the block
    /// cache of `fs_info` are cleared, and the root pointers are restored
    /// from the snapshot so later reads see the on-disk state again.
    pub fn abort(self, fs_info: &mut Filesystem<R>) {
        // Fall back to the generation stored in the superblock.
        fs_info.generation = fs_info.superblock.generation;
        fs_info.clear_dirty();
        fs_info.clear_cache();
        // Roll back any in-memory `set_root_bytenr` calls made during
        // the transaction. Without this, the roots map keeps pointing
        // at COWed-but-never-written bytenrs, and the next transaction
        // will read garbage from disk.
        fs_info.restore_roots_from_snapshot();
    }
}

/// Position of one inline backref inside an `EXTENT_ITEM` payload.
///
/// Produced by `locate_inline_data_ref` and consumed by
/// `decrement_inline_data_ref`; all offsets are relative to the start of
/// the item payload, not the leaf.
#[derive(Debug, Clone, Copy)]
struct InlineRefLocation {
    /// Offset of the inline ref's first byte (the type tag) inside
    /// the item payload.
    inline_offset: usize,
    /// Total size of the inline ref including its type tag. This is the
    /// number of bytes removed from the item when the ref is dropped.
    inline_size: usize,
    /// Current `count` field for the matched `EXTENT_DATA_REF` record,
    /// as read from the payload when the record was located.
    current_count: u32,
}

/// Walk the inline-backref area of an `EXTENT_ITEM` looking for an
/// `EXTENT_DATA_REF` whose `(root, ino, offset)` triple matches the
/// target. Returns `Ok(None)` if the backref is not stored inline; the
/// caller should then look for a standalone `EXTENT_DATA_REF_KEY`
/// item.
///
/// Inline refs with a type tag below `EXTENT_DATA_REF` are skipped, and
/// the walk stops at the first tag above it. Every `EXTENT_DATA_REF`
/// record is compared against the target: the walk does not stop early
/// based on the relative order of record hashes, so the result does not
/// depend on whether same-type records are stored in ascending or
/// descending hash order (Linux stores them descending).
///
/// # Errors
///
/// Returns an error when the payload is shorter than the fixed header,
/// contains an unknown inline ref type, or an inline ref extends past
/// the end of the payload.
fn locate_inline_data_ref(
    leaf: &crate::buffer::ExtentBuffer,
    slot: usize,
    target_root: u64,
    target_ino: u64,
    target_offset: u64,
) -> io::Result<Option<InlineRefLocation>> {
    use btrfs_disk::items::inline_ref_size;

    // Body of an inline EXTENT_DATA_REF after its 1-byte type tag:
    // u64 root, u64 objectid, u64 offset, u32 count.
    const EDR_BODY_LEN: usize = 28;

    let item_key = leaf.item_key(slot);
    let payload = leaf.item_data(slot);
    if payload.len() < 24 {
        return Err(io::Error::other(
            "locate_inline_data_ref: EXTENT_ITEM payload too short",
        ));
    }
    let flags = u64::from_le_bytes(payload[16..24].try_into().unwrap());
    let is_tree_block =
        flags & u64::from(btrfs_disk::raw::BTRFS_EXTENT_FLAG_TREE_BLOCK) != 0;

    // Skip header (24) + optional tree_block_info (18 bytes when this
    // is a non-skinny tree-block extent, i.e. EXTENT_ITEM_KEY).
    let mut cursor = 24usize;
    if is_tree_block && item_key.key_type == KeyType::ExtentItem {
        cursor += 18;
    }
    if cursor > payload.len() {
        return Err(io::Error::other(
            "locate_inline_data_ref: header overruns payload",
        ));
    }

    let edr_type = btrfs_disk::raw::BTRFS_EXTENT_DATA_REF_KEY as u8;

    while cursor < payload.len() {
        let type_byte = payload[cursor];
        let size = inline_ref_size(type_byte).ok_or_else(|| {
            io::Error::other(format!(
                "locate_inline_data_ref: unknown inline ref type {type_byte}"
            ))
        })?;
        if cursor + size > payload.len() {
            return Err(io::Error::other(
                "locate_inline_data_ref: inline ref overruns payload",
            ));
        }

        if type_byte < edr_type {
            cursor += size;
            continue;
        }
        if type_byte > edr_type {
            // Past the EXTENT_DATA_REF range; the target isn't inline.
            return Ok(None);
        }

        // EXTENT_DATA_REF inline record:
        //   1 byte type tag, then btrfs_extent_data_ref (28 bytes):
        //   u64 root, u64 objectid, u64 offset, u32 count.
        // Bounds-checked so a short record is an error, not a panic.
        let body = payload
            .get(cursor + 1..cursor + 1 + EDR_BODY_LEN)
            .ok_or_else(|| {
                io::Error::other(
                    "locate_inline_data_ref: truncated EXTENT_DATA_REF",
                )
            })?;
        let r = u64::from_le_bytes(body[0..8].try_into().unwrap());
        let o = u64::from_le_bytes(body[8..16].try_into().unwrap());
        let off = u64::from_le_bytes(body[16..24].try_into().unwrap());
        let count = u32::from_le_bytes(body[24..28].try_into().unwrap());

        if r == target_root && o == target_ino && off == target_offset {
            return Ok(Some(InlineRefLocation {
                inline_offset: cursor,
                inline_size: size,
                current_count: count,
            }));
        }

        // A different EXTENT_DATA_REF record. Keep scanning: stopping on
        // a hash comparison would only be valid for one particular sort
        // direction and could hide a record that is present further on.
        cursor += size;
    }

    Ok(None)
}

/// Drop `refs_to_drop` references from an inline `EXTENT_DATA_REF` and
/// from the owning `EXTENT_ITEM.refs` counter, removing the inline
/// record entirely once its count reaches zero. Returns the new total
/// `EXTENT_ITEM.refs` value.
///
/// # Errors
///
/// Fails when `refs_to_drop` exceeds either the inline record's count
/// or the item's total reference count, or when shrinking the item
/// fails.
fn decrement_inline_data_ref(
    leaf: &mut crate::buffer::ExtentBuffer,
    slot: usize,
    location: &InlineRefLocation,
    refs_to_drop: u32,
) -> io::Result<u64> {
    // Validate against the inline record first; nothing is written
    // until both counters are known to stay non-negative.
    let Some(remaining) = location.current_count.checked_sub(refs_to_drop)
    else {
        return Err(io::Error::other(
            "decrement_inline_data_ref: count underflow",
        ));
    };

    // EXTENT_ITEM.refs occupies the first eight payload bytes.
    let stored_refs =
        u64::from_le_bytes(leaf.item_data(slot)[0..8].try_into().unwrap());
    let Some(total_refs) = stored_refs.checked_sub(u64::from(refs_to_drop))
    else {
        return Err(io::Error::other(
            "decrement_inline_data_ref: EXTENT_ITEM.refs underflow",
        ));
    };
    leaf.item_data_mut(slot)[0..8].copy_from_slice(&total_refs.to_le_bytes());

    if remaining == 0 {
        // The record is gone: slide everything behind it forward over
        // the hole, then trim the now-unused tail off the item.
        let ref_start = location.inline_offset;
        let ref_end = ref_start + location.inline_size;
        let item_len = leaf.item_size(slot) as usize;
        if ref_end < item_len {
            leaf.item_data_mut(slot)
                .copy_within(ref_end..item_len, ref_start);
        }
        items::shrink_item(leaf, slot, location.inline_size as u32)?;
    } else {
        // The record survives; patch its count in place.
        // Inline EDR layout: [type=1B][root=8B][oid=8B][off=8B][count=4B]
        let count_at = location.inline_offset + 1 + 24;
        leaf.item_data_mut(slot)[count_at..count_at + 4]
            .copy_from_slice(&remaining.to_le_bytes());
    }

    Ok(total_refs)
}

/// Locate free space in a block group of the requested `kind`, at or
/// after `min_addr`.
///
/// `alignment` constrains the returned start address and `min_size` is
/// the smallest contiguous free span that is acceptable. For metadata
/// both are `nodesize`; for data, `alignment` is `sectorsize` and
/// `min_size` is the requested data extent length.
///
/// Candidate block groups are tried from most to least free space, and
/// within each one the free gaps reported by the extent-tree scan are
/// checked in order. Returns `(first_free_logical, region_end)` for the
/// first gap that fits.
///
/// # Errors
///
/// Propagates block-group loading and scan failures, and returns an
/// error when no block group of this kind has a suitable gap.
fn find_alloc_region_after<R: Read + Write + Seek>(
    fs_info: &mut Filesystem<R>,
    kind: BlockGroupKind,
    min_addr: u64,
    alignment: u64,
    min_size: u64,
) -> io::Result<(u64, u64)> {
    use crate::allocation;

    let groups = allocation::load_block_groups(fs_info)?;

    // Keep the block groups of the right kind that report enough free
    // bytes to possibly satisfy the request.
    let mut ranked: Vec<&allocation::BlockGroup> = Vec::new();
    for group in &groups {
        let right_kind = match kind {
            BlockGroupKind::Metadata => group.is_metadata(),
            BlockGroupKind::System => group.is_system(),
            BlockGroupKind::Data => group.is_data(),
        };
        if right_kind && group.free() >= min_size {
            ranked.push(group);
        }
    }
    // Emptiest block group first (stable, descending by free bytes).
    ranked.sort_by(|a, b| b.free().cmp(&a.free()));

    for group in ranked {
        let gaps = allocation::find_free_extents(
            fs_info,
            group.start,
            group.length,
            min_size,
        )?;

        let fit = gaps.iter().find_map(|&(gap_start, gap_len)| {
            let first = align_up(gap_start.max(min_addr), alignment);
            let gap_end = gap_start + gap_len;
            (first + min_size <= gap_end).then_some((first, gap_end))
        });
        if let Some(region) = fit {
            return Ok(region);
        }
    }

    Err(io::Error::other(format!(
        "no {kind:?} block group with free space",
    )))
}

/// Fill one backup root slot of the superblock from the current
/// filesystem state.
///
/// The superblock has 4 rotating backup root entries. On each commit, one
/// slot is overwritten (cycling 0 -> 1 -> 2 -> 3 -> 0). Each entry
/// captures the root pointers, generations, and levels of the 6 core trees
/// plus filesystem size counters.
///
/// `slot` indexes `superblock.backup_roots` directly.
fn update_backup_root<R: Read + Write + Seek>(
    fs_info: &mut Filesystem<R>,
    slot: usize,
) {
    use btrfs_disk::superblock::BackupRoot;

    // Object ids of the six trees recorded in a backup root entry.
    const ROOT_TREE_ID: u64 = 1;
    const EXTENT_TREE_ID: u64 = 2;
    const CHUNK_TREE_ID: u64 = 3;
    const DEV_TREE_ID: u64 = 4;
    const FS_TREE_ID: u64 = 5;
    const CSUM_TREE_ID: u64 = 7;

    /// Describe a tree's root block as `(bytenr, generation, level)`.
    /// Yields `(0, 0, 0)` when the tree has no root pointer (or it is
    /// zero), and `(bytenr, 0, 0)` when the root block cannot be read.
    fn root_info<R: Read + Write + Seek>(
        fs_info: &mut Filesystem<R>,
        tree_id: u64,
    ) -> (u64, u64, u8) {
        let bytenr = match fs_info.root_bytenr(tree_id) {
            Some(addr) if addr != 0 => addr,
            _ => return (0, 0, 0),
        };
        match fs_info.read_block(bytenr) {
            Err(_) => (bytenr, 0, 0),
            Ok(eb) => (bytenr, eb.generation(), eb.level()),
        }
    }

    let tree = root_info(fs_info, ROOT_TREE_ID);
    let chunk = root_info(fs_info, CHUNK_TREE_ID);
    let extent = root_info(fs_info, EXTENT_TREE_ID);
    let fs = root_info(fs_info, FS_TREE_ID);
    let dev = root_info(fs_info, DEV_TREE_ID);
    let csum = root_info(fs_info, CSUM_TREE_ID);

    fs_info.superblock.backup_roots[slot] = BackupRoot {
        tree_root: tree.0,
        tree_root_gen: tree.1,
        chunk_root: chunk.0,
        chunk_root_gen: chunk.1,
        extent_root: extent.0,
        extent_root_gen: extent.1,
        fs_root: fs.0,
        fs_root_gen: fs.1,
        dev_root: dev.0,
        dev_root_gen: dev.1,
        csum_root: csum.0,
        csum_root_gen: csum.1,
        total_bytes: fs_info.superblock.total_bytes,
        bytes_used: fs_info.superblock.bytes_used,
        num_devices: fs_info.superblock.num_devices,
        tree_root_level: tree.2,
        chunk_root_level: chunk.2,
        extent_root_level: extent.2,
        fs_root_level: fs.2,
        dev_root_level: dev.2,
        csum_root_level: csum.2,
    };
}

/// Look up the block group that owns a logical byte address.
///
/// Scans `groups` in order and returns the start address of the first
/// block group whose half-open range `[start, start + length)` contains
/// `bytenr`, or `None` when no block group covers it.
fn find_containing_block_group(
    groups: &[allocation::BlockGroup],
    bytenr: u64,
) -> Option<u64> {
    for group in groups {
        let first = group.start;
        if bytenr >= first && bytenr < first + group.length {
            return Some(first);
        }
    }
    None
}

/// Round `value` up to the nearest multiple of `align`.
///
/// Works for any non-zero alignment, not only powers of two, and
/// returns `value` unchanged when it is already aligned — including
/// values close to `u64::MAX`, where adding `align - 1` would overflow.
///
/// # Panics
///
/// Panics if `align` is zero. Also panics (when overflow checks are
/// enabled) if the next multiple does not fit in a `u64`.
const fn align_up(value: u64, align: u64) -> u64 {
    value.next_multiple_of(align)
}

/// Compress `data` with `algorithm` into the payload format used by
/// inline `EXTENT_DATA` items.
///
/// Yields `Some(payload)` only when the payload is strictly smaller
/// than `data`; otherwise the caller is expected to store the raw
/// bytes. Empty input, `CompressionType::None` and
/// `CompressionType::Unknown(_)` always yield `None`, as does any
/// compressor failure.
///
/// Payload layout per algorithm:
///
/// - zlib (level 3) and zstd (level 3): the raw compressor output.
/// - LZO: `[4B total_len LE] [4B seg_len LE] [lzo1x compressed bytes]`,
///   where `total_len` counts the 8 header bytes as well.
#[must_use]
pub fn try_compress(
    data: &[u8],
    algorithm: btrfs_disk::items::CompressionType,
) -> Option<Vec<u8>> {
    use btrfs_disk::items::CompressionType;

    if data.is_empty() {
        return None;
    }

    let packed: Vec<u8> = match algorithm {
        CompressionType::Zstd => zstd::bulk::compress(data, 3).ok()?,
        CompressionType::Zlib => {
            use flate2::{Compression, write::ZlibEncoder};
            use std::io::Write;
            let mut encoder =
                ZlibEncoder::new(Vec::new(), Compression::new(3));
            encoder.write_all(data).ok()?;
            encoder.finish().ok()?
        }
        CompressionType::Lzo => {
            let segment = lzokay::compress::compress(data).ok()?;
            // Both length fields must fit the on-disk 32-bit headers.
            let total_len: u32 = (4 + 4 + segment.len()).try_into().ok()?;
            let seg_len: u32 = segment.len().try_into().ok()?;
            let mut framed = Vec::with_capacity(total_len as usize);
            framed.extend_from_slice(&total_len.to_le_bytes());
            framed.extend_from_slice(&seg_len.to_le_bytes());
            framed.extend_from_slice(&segment);
            framed
        }
        CompressionType::None | CompressionType::Unknown(_) => return None,
    };

    // Only worth keeping when it actually saves space.
    (packed.len() < data.len()).then_some(packed)
}

/// Attempt to compress `data` for storage in a regular (non-inline)
/// `EXTENT_DATA`.
///
/// For zlib and zstd this delegates to [`try_compress`]: the same
/// compressor output is used inline or out-of-line. For LZO this
/// applies the per-sector framing format
/// `[4B total_len LE] { [4B seg_len LE] [lzo1x bytes] [zero pad] }*`
/// where each input sector is compressed independently and zero-pad
/// is inserted whenever the next 4-byte segment header would cross a
/// sector boundary.
///
/// An early-exit heuristic abandons LZO compression once four or more
/// sectors have been processed and the framed buffer exceeds 3 sectors
/// — incompressible inputs lose to the per-sector header overhead, so
/// it is cheaper to bail and let the caller write raw bytes.
///
/// Returns `Some(payload)` only if the result strictly shrinks `data`.
/// Returns `None` for empty input, for a `sectorsize` of zero on the
/// LZO path (no framing is possible), and on any compressor failure.
#[must_use]
pub fn try_compress_regular(
    data: &[u8],
    algorithm: btrfs_disk::items::CompressionType,
    sectorsize: u32,
) -> Option<Vec<u8>> {
    use btrfs_disk::items::CompressionType;

    if data.is_empty() {
        return None;
    }
    if !matches!(algorithm, CompressionType::Lzo) {
        return try_compress(data, algorithm);
    }

    let ss = sectorsize as usize;
    // A zero sector size would divide by zero below; treat it as
    // "cannot compress" so the caller falls back to raw bytes.
    if ss == 0 {
        return None;
    }

    // Reserve space for the leading 4-byte total_len header; we'll
    // patch it in once we know the final length.
    let mut buf = Vec::with_capacity(data.len());
    buf.extend_from_slice(&[0u8; 4]);

    // One independently compressed segment per input sector; the last
    // sector may be short.
    for (i, sector) in data.chunks(ss).enumerate() {
        let seg = lzokay::compress::compress(sector).ok()?;
        let seg_len: u32 = seg.len().try_into().ok()?;
        buf.extend_from_slice(&seg_len.to_le_bytes());
        buf.extend_from_slice(&seg);

        // Pad if the next segment's 4-byte header would cross a sector
        // boundary. `pos % ss == 0` means we're already aligned and have
        // a full sector ahead, so no padding.
        let pos = buf.len();
        let sector_rem = ss - (pos % ss);
        if sector_rem < 4 && sector_rem < ss {
            buf.resize(pos + sector_rem, 0);
        }

        // Early-exit heuristic: incompressible data would otherwise grow
        // because of the per-sector overhead. After 4 sectors, if we're
        // already past 3 sectors of output, abandon.
        // NOTE(review): this is re-evaluated on every sector from the
        // fourth onward, so any input whose framed output exceeds three
        // sectors is rejected regardless of input size — confirm that is
        // intended rather than a one-time check at the fourth sector.
        if i >= 3 && buf.len() > 3 * ss {
            return None;
        }
    }

    let total_len: u32 = buf.len().try_into().ok()?;
    buf[0..4].copy_from_slice(&total_len.to_le_bytes());

    if buf.len() < data.len() {
        Some(buf)
    } else {
        None
    }
}

/// Maximum payload bytes that fit in an inline `EXTENT_DATA` item.
///
/// Derived as `min(nodesize - 147, sectorsize - 1)`. The 147 accounts
/// for the leaf header (`HEADER_SIZE` = 101), the per-item descriptor
/// (`ITEM_SIZE` = 25), and the `FileExtentItem` header (21 bytes). The
/// `sectorsize - 1` cap is btrfs's convention: sector-or-larger files
/// must use a regular extent.
///
/// Both subtractions saturate at zero, so a `nodesize` too small to hold
/// the fixed overhead, or a `sectorsize` of zero, yields `0` (nothing can
/// be inlined) rather than an arithmetic underflow.
///
/// For default 16K nodesize / 4K sectorsize: 4095 bytes.
#[must_use]
pub const fn max_inline_data_size(sectorsize: u32, nodesize: u32) -> usize {
    // Size of the fixed `FileExtentItem` header that precedes inline data.
    const FILE_EXTENT_HEADER: usize = 21;

    // Space left in a leaf holding exactly one item, after all headers.
    let max_item_inline = (nodesize as usize)
        .saturating_sub(HEADER_SIZE + ITEM_SIZE + FILE_EXTENT_HEADER);
    // Inline data must be strictly smaller than one sector.
    let sector_cap = (sectorsize as usize).saturating_sub(1);

    if max_item_inline < sector_cap {
        max_item_inline
    } else {
        sector_cap
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use btrfs_disk::items::CompressionType;

    /// Sector size used by the per-sector (regular extent) tests.
    const SECTOR: u32 = 4096;

    /// A trivially compressible buffer: `len` copies of one byte.
    fn constant_fill(len: usize) -> Vec<u8> {
        vec![0x42u8; len]
    }

    /// `len` high-entropy bytes from a fixed-seed xorshift64 generator;
    /// none of the supported algorithms can shrink this.
    fn xorshift_bytes(len: usize) -> Vec<u8> {
        let mut state: u64 = 0xDEAD_BEEF_CAFE_F00D;
        (0..len)
            .map(|_| {
                state ^= state << 13;
                state ^= state >> 7;
                state ^= state << 17;
                state as u8
            })
            .collect()
    }

    /// Decode the little-endian `u32` stored at `offset` in `buf`.
    fn le_u32_at(buf: &[u8], offset: usize) -> usize {
        let raw: [u8; 4] = buf[offset..offset + 4].try_into().unwrap();
        u32::from_le_bytes(raw) as usize
    }

    // --- align_up ---------------------------------------------------------

    #[test]
    fn align_up_already_aligned() {
        assert_eq!(align_up(4096, 4096), 4096);
    }

    #[test]
    fn align_up_not_aligned() {
        assert_eq!(align_up(4097, 4096), 8192);
    }

    #[test]
    fn align_up_zero() {
        assert_eq!(align_up(0, 4096), 0);
    }

    // --- max_inline_data_size --------------------------------------------

    #[test]
    fn max_inline_default_nodesize_sectorsize() {
        // With 16K nodes and 4K sectors the sectorsize - 1 cap wins.
        assert_eq!(max_inline_data_size(4096, 16384), 4095);
    }

    #[test]
    fn max_inline_large_sectorsize_capped_by_nodesize() {
        // With 4K nodes and 64K sectors the leaf budget wins.
        assert_eq!(max_inline_data_size(65536, 4096), 4096 - 147);
    }

    // --- try_compress (inline path) --------------------------------------

    #[test]
    fn try_compress_zlib_compressible() {
        let input = constant_fill(4096);
        let packed = try_compress(&input, CompressionType::Zlib).unwrap();
        assert!(packed.len() < input.len());
    }

    #[test]
    fn try_compress_zstd_compressible() {
        let input = constant_fill(4096);
        let packed = try_compress(&input, CompressionType::Zstd).unwrap();
        assert!(packed.len() < input.len());
    }

    #[test]
    fn try_compress_incompressible_returns_none() {
        // Neither zlib nor zstd can shrink xorshift64 output.
        let input = xorshift_bytes(4096);
        assert!(try_compress(&input, CompressionType::Zlib).is_none());
        assert!(try_compress(&input, CompressionType::Zstd).is_none());
    }

    #[test]
    fn try_compress_empty_returns_none() {
        assert!(try_compress(&[], CompressionType::Zlib).is_none());
        assert!(try_compress(&[], CompressionType::Zstd).is_none());
    }

    #[test]
    fn try_compress_none_unknown_return_none() {
        let input = constant_fill(4096);
        assert!(try_compress(&input, CompressionType::None).is_none());
        assert!(try_compress(&input, CompressionType::Unknown(99)).is_none());
    }

    #[test]
    fn try_compress_lzo_inline_compressible() {
        let input = constant_fill(4096);
        let packed = try_compress(&input, CompressionType::Lzo).unwrap();
        assert!(packed.len() < input.len());
        // Inline layout: [4B total_len LE][4B seg_len LE][lzo bytes]
        let total_len = le_u32_at(&packed, 0);
        let seg_len = le_u32_at(&packed, 4);
        assert_eq!(total_len, packed.len(), "total_len matches buffer size");
        assert_eq!(seg_len + 8, packed.len(), "seg_len + 8B header == total");
    }

    // --- try_compress_regular (per-sector path) --------------------------

    #[test]
    fn try_compress_regular_lzo_compressible() {
        // Two sectors of compressible data exercise the per-sector framing.
        let input = constant_fill(8192);
        let packed =
            try_compress_regular(&input, CompressionType::Lzo, SECTOR).unwrap();
        assert!(packed.len() < input.len());
        assert_eq!(le_u32_at(&packed, 0), packed.len());
    }

    #[test]
    fn try_compress_regular_lzo_incompressible_short_circuits() {
        // Eight sectors of high-entropy data must hit the early exit.
        let input = xorshift_bytes(32 * 1024);
        assert!(
            try_compress_regular(&input, CompressionType::Lzo, SECTOR).is_none()
        );
    }

    #[test]
    fn try_compress_regular_zlib_zstd_match_inline() {
        // For non-LZO algorithms the inline and regular paths must agree
        // byte for byte.
        let input = constant_fill(4096);
        for ct in [CompressionType::Zlib, CompressionType::Zstd] {
            let inline_buf = try_compress(&input, ct).unwrap();
            let regular_buf = try_compress_regular(&input, ct, SECTOR).unwrap();
            assert_eq!(inline_buf, regular_buf, "{ct:?}");
        }
    }
}