// btrfs_disk/chunk.rs
1//! # Logical-to-physical address mapping for btrfs filesystems
2//!
3//! Btrfs maps logical addresses to physical device offsets through chunk items
4//! stored in the chunk tree. The superblock embeds a small subset of the chunk
5//! tree (the system chunk array) to bootstrap access to the full chunk tree.
6//!
7//! This module provides a `ChunkTreeCache` that resolves logical addresses to
8//! physical offsets, seeded from the `sys_chunk_array` and then populated from
9//! the full chunk tree.
10
11use crate::raw;
12use bytes::Buf;
13use std::{collections::BTreeMap, mem};
14use uuid::Uuid;
15
16fn get_uuid(buf: &mut &[u8]) -> Uuid {
17    let bytes: [u8; 16] = buf[..16].try_into().unwrap();
18    buf.advance(16);
19    Uuid::from_bytes(bytes)
20}
21
/// A single stripe in a chunk mapping, identifying a physical location on a device.
///
/// For mirrored profiles every stripe of a chunk carries the same bytes;
/// for striped profiles each stripe carries a distinct column of the chunk.
#[derive(Debug, Clone)]
pub struct Stripe {
    /// Device ID where this stripe resides.
    pub devid: u64,
    /// Physical byte offset on the device where this stripe's data starts.
    pub offset: u64,
    /// UUID of the device holding this stripe.
    pub dev_uuid: Uuid,
}
32
/// A chunk mapping: maps a range of logical addresses to physical device locations.
///
/// Covers logical bytes `[logical, logical + length)`. `stripes` is
/// expected to contain `num_stripes` entries (parsed from the on-disk
/// chunk item).
#[derive(Debug, Clone)]
pub struct ChunkMapping {
    /// Starting logical byte address of this chunk.
    pub logical: u64,
    /// Length of this chunk in bytes (logical address space).
    pub length: u64,
    /// Stripe length in bytes for striped profiles (RAID0/10/5/6).
    pub stripe_len: u64,
    /// Chunk type flags (DATA/METADATA/SYSTEM + RAID profile).
    pub chunk_type: u64,
    /// Number of stripes (device copies/segments).
    pub num_stripes: u16,
    /// Sub-stripes for RAID10 (mirror fan-out per data group).
    pub sub_stripes: u16,
    /// Physical device locations for each stripe.
    pub stripes: Vec<Stripe>,
}
51
/// RAID profile derived from a chunk's `chunk_type` flags.
///
/// Decoded from the on-disk flags by [`ChunkProfile::from_chunk_type`].
///
/// `RAID1` covers all mirrored profiles (RAID1, RAID1C3, RAID1C4) since
/// they share the same routing math (every stripe gets the same bytes).
/// The number of mirrors is given by `ChunkMapping::num_stripes`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ChunkProfile {
    /// No replication, no striping. One stripe, written/read whole.
    Single,
    /// Two copies on the same device (mostly used for metadata on
    /// single-device filesystems). Both stripes get the same bytes.
    Dup,
    /// Striped across devices, no redundancy. Each row of `stripe_len`
    /// bytes lands on a single device; consecutive rows round-robin.
    Raid0,
    /// Mirrored across N devices (`num_stripes` = 2/3/4 for
    /// RAID1 / RAID1C3 / RAID1C4). Every stripe gets the same bytes.
    Raid1,
    /// Striped mirrors. `num_stripes / sub_stripes` data groups, each
    /// mirrored `sub_stripes`-ways (always 2 in practice).
    Raid10,
    /// Striped with parity (single parity device). Planned via the
    /// parity path ([`WritePlan::Parity`]).
    Raid5,
    /// Striped with double parity. Planned via the parity path.
    Raid6,
}
78
79impl ChunkProfile {
80    /// Decode the RAID profile bits of an on-disk `chunk_type` field.
81    ///
82    /// SINGLE is the absence of any profile bit. The data/metadata/system
83    /// bits are ignored — only the RAID profile bits matter for routing.
84    #[must_use]
85    pub fn from_chunk_type(chunk_type: u64) -> Self {
86        if chunk_type & u64::from(raw::BTRFS_BLOCK_GROUP_RAID0) != 0 {
87            Self::Raid0
88        } else if chunk_type & u64::from(raw::BTRFS_BLOCK_GROUP_RAID10) != 0 {
89            Self::Raid10
90        } else if chunk_type & u64::from(raw::BTRFS_BLOCK_GROUP_RAID5) != 0 {
91            Self::Raid5
92        } else if chunk_type & u64::from(raw::BTRFS_BLOCK_GROUP_RAID6) != 0 {
93            Self::Raid6
94        } else if chunk_type
95            & u64::from(
96                raw::BTRFS_BLOCK_GROUP_RAID1
97                    | raw::BTRFS_BLOCK_GROUP_RAID1C3
98                    | raw::BTRFS_BLOCK_GROUP_RAID1C4,
99            )
100            != 0
101        {
102            Self::Raid1
103        } else if chunk_type & u64::from(raw::BTRFS_BLOCK_GROUP_DUP) != 0 {
104            Self::Dup
105        } else {
106            Self::Single
107        }
108    }
109}
110
111impl ChunkMapping {
112    /// Decode the chunk's RAID profile from its `chunk_type` field.
113    #[must_use]
114    pub fn profile(&self) -> ChunkProfile {
115        ChunkProfile::from_chunk_type(self.chunk_type)
116    }
117}
118
/// One per-device write or read produced by [`ChunkTreeCache::plan_write`]
/// or [`ChunkTreeCache::plan_read`].
///
/// `buf_offset..buf_offset + len` is the slice of the caller's buffer
/// that goes to (or comes from) this device at `physical`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StripePlacement {
    /// Device id where this slice goes.
    pub devid: u64,
    /// Physical byte offset on the device.
    pub physical: u64,
    /// Byte offset within the caller's buffer where this slice starts.
    pub buf_offset: usize,
    /// Number of bytes to write (or read). Never spans a stripe
    /// boundary: each placement covers at most one row.
    pub len: usize,
}
135
/// Result of [`ChunkTreeCache::plan_write`].
///
/// Non-parity profiles (SINGLE / DUP / RAID1* / RAID0 / RAID10) produce a
/// `Plain` plan: a flat list of placements, each a slice of the caller's
/// buffer to a `(devid, physical)` location. RAID5 and RAID6 produce a
/// `Parity` plan: per-row descriptors that the executor must read into
/// scratch buffers, mix with the caller's bytes, compute parity over,
/// then write data + parity slices to the device columns.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum WritePlan {
    /// SINGLE / DUP / RAID1* / RAID0 / RAID10 placements. The caller
    /// just iterates the vec and writes slices of its buffer.
    Plain(Vec<StripePlacement>),
    /// RAID5 / RAID6 placements. The caller must run the parity
    /// executor: preread every data column slot of every touched row,
    /// overlay caller bytes, compute P (and Q for RAID6), then write
    /// the overlaid byte ranges and the parity slots.
    Parity(ParityPlan),
}
155
156impl WritePlan {
157    /// Convenience for tests and non-parity callers: returns the
158    /// placements if `Plain`.
159    ///
160    /// # Panics
161    ///
162    /// Panics if the plan is `Parity` — used in test code where the
163    /// profile under test should never produce a parity plan.
164    #[cfg(test)]
165    #[must_use]
166    pub fn unwrap_plain(self) -> Vec<StripePlacement> {
167        match self {
168            Self::Plain(p) => p,
169            Self::Parity(_) => panic!("plan_write returned Parity"),
170        }
171    }
172}
173
/// Per-row write plan for RAID5/RAID6 chunks.
///
/// All rows share `stripe_len` (every data column slot is exactly
/// `stripe_len` bytes; every parity slot is exactly `stripe_len`
/// bytes). The executor allocates `stripe_len`-sized scratch buffers
/// per data column per row.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParityPlan {
    /// Bytes per column slot. Same for every row; carried once at the
    /// plan level for convenience.
    pub stripe_len: u32,
    /// One descriptor per physical row touched by the write, in
    /// ascending row order.
    pub rows: Vec<ParityRow>,
}
188
/// One physical row of a RAID5/RAID6 chunk that the write touches.
///
/// `data_columns` lists the data column slots in the row (length
/// `num_stripes - nparity`). Every entry is a full `stripe_len` slot
/// on a device; the executor must preread the slot, optionally
/// overlay caller bytes, then both write the overlaid range back to
/// the device (if the overlay is non-empty) and use the assembled
/// slot for parity computation.
///
/// `parity_targets` are the parity column slots (1 entry for RAID5,
/// 2 for RAID6). The executor writes the computed parity bytes to
/// each target's physical offset.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParityRow {
    /// One per data stripe of the row, in column order (data column 0
    /// of the row first). Length equals `num_stripes - nparity`.
    pub data_columns: Vec<ParityDataColumn>,
    /// Parity column outputs for the row (1 for RAID5, 2 for RAID6).
    pub parity_targets: Vec<ParityTarget>,
}
209
/// One data column slot in a [`ParityRow`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParityDataColumn {
    /// Device this column lives on.
    pub devid: u64,
    /// Physical byte offset of the slot's start on the device. The
    /// slot always covers `[physical, physical + stripe_len)`.
    pub physical: u64,
    /// Byte range from the caller's buffer that overlays the slot, or
    /// `None` if the caller did not touch this column (parity still
    /// needs the existing bytes from disk, so the slot is preread
    /// either way).
    pub overlay: Option<CallerOverlay>,
}
223
/// A range of caller bytes that overlays a data column slot.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CallerOverlay {
    /// Offset within the column slot where the overlay starts
    /// (invariant: `0 <= slot_offset < stripe_len`).
    pub slot_offset: u32,
    /// Offset in the caller's buffer where the overlay bytes start.
    pub buf_offset: usize,
    /// Length of the overlay in bytes
    /// (invariant: `slot_offset + len <= stripe_len`).
    pub len: u32,
}
236
/// One parity column slot to write.
///
/// The bytes themselves are computed by the executor — this struct
/// only carries the destination.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParityTarget {
    /// `P` (XOR) or `Q` (Reed-Solomon).
    pub kind: ParityKind,
    /// Device id of the parity column.
    pub devid: u64,
    /// Physical byte offset of the slot's start on the device. Length
    /// is always `stripe_len`.
    pub physical: u64,
}
251
/// Which parity polynomial a [`ParityTarget`] holds.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParityKind {
    /// XOR of the row's data columns. Used by RAID5 and RAID6.
    P,
    /// Reed-Solomon over GF(2^8) with `x^8 + x^4 + x^3 + x^2 + 1`.
    /// RAID6 only.
    Q,
}
261
/// Cache of chunk tree mappings for resolving logical to physical addresses.
///
/// Keyed by logical start address. Uses a `BTreeMap` for efficient range
/// lookups. Seeded from the superblock's `sys_chunk_array`, then populated
/// from the full chunk tree (see module docs).
#[derive(Debug, Default)]
pub struct ChunkTreeCache {
    // Map: chunk logical start -> full mapping for that chunk.
    inner: BTreeMap<u64, ChunkMapping>,
}
269
270impl ChunkTreeCache {
271    /// Insert a chunk mapping into the cache.
272    pub fn insert(&mut self, mapping: ChunkMapping) {
273        self.inner.insert(mapping.logical, mapping);
274    }
275
276    /// Look up the chunk mapping that contains the given logical address.
277    #[must_use]
278    pub fn lookup(&self, logical: u64) -> Option<&ChunkMapping> {
279        // Find the entry whose start is <= logical
280        self.inner
281            .range(..=logical)
282            .next_back()
283            .map(|(_, mapping)| mapping)
284            .filter(|mapping| logical < mapping.logical + mapping.length)
285    }
286
287    /// Resolve a logical address to `(devid, physical)` for the first stripe.
288    ///
289    /// For read-only access the first stripe is sufficient on SINGLE, DUP,
290    /// and any mirroring profile. RAID0/5/6/10 striping would need stripe
291    /// index calculation, but for tree blocks (always `nodesize <= stripe_len`)
292    /// the whole block lives in one stripe slot, so this works for the
293    /// common case.
294    ///
295    /// Callers using a multi-device `BlockReader` look up the device handle
296    /// by `devid`; single-device callers ignore it.
297    #[must_use]
298    pub fn resolve(&self, logical: u64) -> Option<(u64, u64)> {
299        let mapping = self.lookup(logical)?;
300        let offset_within_chunk = logical - mapping.logical;
301        let stripe = &mapping.stripes[0];
302        Some((stripe.devid, stripe.offset + offset_within_chunk))
303    }
304
305    /// Resolve a logical address to `(devid, physical)` for every stripe.
306    ///
307    /// For DUP, RAID1, RAID1C3, and RAID1C4, a single logical address maps
308    /// to multiple physical copies. Write operations must update all copies
309    /// to maintain consistency.
310    ///
311    /// **Use [`plan_write`](Self::plan_write) for actual write routing.**
312    /// `resolve_all` ignores the chunk's RAID profile and assumes every
313    /// stripe should receive the same bytes; that is correct for DUP /
314    /// RAID1* but wrong for RAID0 (each row goes to one device only) and
315    /// RAID10 (each row goes to one mirror pair, not all pairs). Kept
316    /// for diagnostics and read-only callers that only need a list of
317    /// stripe locations.
318    #[must_use]
319    pub fn resolve_all(&self, logical: u64) -> Option<Vec<(u64, u64)>> {
320        let mapping = self.lookup(logical)?;
321        let offset_within_chunk = logical - mapping.logical;
322        Some(
323            mapping
324                .stripes
325                .iter()
326                .map(|s| (s.devid, s.offset + offset_within_chunk))
327                .collect(),
328        )
329    }
330
331    /// Plan the per-device writes needed to land `len` bytes at the
332    /// logical address `logical`, accounting for the chunk's RAID
333    /// profile and stripe length.
334    ///
335    /// Returns a [`WritePlan`]: a `Plain` variant (a flat vec of
336    /// [`StripePlacement`]s) for non-parity profiles, and a `Parity`
337    /// variant ([`ParityPlan`]) for RAID5/RAID6.
338    ///
339    /// Per-profile fan-out for a single row of a non-parity profile:
340    ///
341    /// - SINGLE: one placement (column 0).
342    /// - DUP / RAID1 / RAID1C3 / RAID1C4: `num_stripes` placements
343    ///   (every stripe gets the same bytes).
344    /// - RAID0: one placement (column = `stripe_nr % num_stripes`).
345    /// - RAID10: `sub_stripes` placements (the mirror pair for the row).
346    ///
347    /// For RAID5/RAID6 the plan instead names every data column slot of
348    /// every touched physical row plus the rotating parity column(s);
349    /// the caller must run a parity executor that prereads the data
350    /// slots, mixes in caller bytes, computes parity, then writes data
351    /// + parity to the device.
352    ///
353    /// Buffers larger than `stripe_len - stripe_offset` span multiple
354    /// rows; each row's placements are appended in order.
355    ///
356    /// Returns `None` if `logical` is unmapped or if `logical + len`
357    /// exceeds the chunk.
358    #[must_use]
359    pub fn plan_write(&self, logical: u64, len: usize) -> Option<WritePlan> {
360        let mapping = self.lookup(logical)?;
361        match mapping.profile() {
362            ChunkProfile::Raid5 | ChunkProfile::Raid6 => {
363                plan_parity_write(mapping, logical, len).map(WritePlan::Parity)
364            }
365            _ => plan_io(mapping, logical, len, /* read = */ false)
366                .map(WritePlan::Plain),
367        }
368    }
369
370    /// Plan the per-device reads needed to fetch `len` bytes from the
371    /// logical address `logical`. Returns exactly one placement per row
372    /// (the first stripe of each row, or the row's data column for
373    /// RAID5/RAID6) — the caller assembles the bytes in order.
374    ///
375    /// Reads on RAID5/RAID6 ignore parity columns: the data column
376    /// owning each row's bytes is read directly. Degraded reads
377    /// (reconstructing a missing data column from parity) are out of
378    /// scope.
379    ///
380    /// Returns `None` if `logical` is unmapped or if `logical + len`
381    /// exceeds the chunk.
382    #[must_use]
383    pub fn plan_read(
384        &self,
385        logical: u64,
386        len: usize,
387    ) -> Option<Vec<StripePlacement>> {
388        let mapping = self.lookup(logical)?;
389        match mapping.profile() {
390            ChunkProfile::Raid5 | ChunkProfile::Raid6 => {
391                plan_parity_read(mapping, logical, len)
392            }
393            _ => plan_io(mapping, logical, len, /* read = */ true),
394        }
395    }
396
397    /// Return the number of cached chunk mappings.
398    #[must_use]
399    pub fn len(&self) -> usize {
400        self.inner.len()
401    }
402
403    /// Return true if the cache is empty.
404    #[must_use]
405    pub fn is_empty(&self) -> bool {
406        self.inner.is_empty()
407    }
408
409    /// Iterate over all chunk mappings in logical address order.
410    pub fn iter(&self) -> impl Iterator<Item = &ChunkMapping> {
411        self.inner.values()
412    }
413}
414
415/// Compute per-device placements for a logical-range I/O within `mapping`.
416///
417/// The shared core for plain (non-parity) write and read planning:
418/// walks the request row by row, and for each row picks the columns of
419/// `mapping.stripes` that own that row's bytes per the chunk's RAID
420/// profile.
421///
422/// `read` is true for read planning (one placement per row, picking the
423/// first column of the row's mirror group) and false for write planning
424/// (every column of the row's mirror group).
425///
426/// RAID5/RAID6 must be routed via [`plan_parity_write`] /
427/// [`plan_parity_read`] — this function panics on those profiles in
428/// debug builds and returns `None` in release builds.
429fn plan_io(
430    mapping: &ChunkMapping,
431    logical: u64,
432    len: usize,
433    read: bool,
434) -> Option<Vec<StripePlacement>> {
435    if len == 0 {
436        return Some(Vec::new());
437    }
438    // Bounds check: the entire request must fit in the chunk.
439    let end = logical.checked_add(len as u64)?;
440    if end > mapping.logical.checked_add(mapping.length)? {
441        return None;
442    }
443    let profile = mapping.profile();
444    debug_assert!(
445        !matches!(profile, ChunkProfile::Raid5 | ChunkProfile::Raid6),
446        "plan_io does not handle RAID5/RAID6; route via plan_parity_*",
447    );
448    if matches!(profile, ChunkProfile::Raid5 | ChunkProfile::Raid6) {
449        return None;
450    }
451
452    let stripe_len = mapping.stripe_len;
453    debug_assert!(stripe_len > 0, "chunk stripe_len must be non-zero");
454
455    // Depending on the profile, only some columns of `mapping.stripes`
456    // are independently addressable. SINGLE/DUP/RAID1*: 1 column (the
457    // others are mirrors). RAID0: every column carries different rows.
458    // RAID10: sub_stripes columns per group, num_stripes/sub_stripes
459    // groups total.
460    let factor: u64 = match profile {
461        ChunkProfile::Single | ChunkProfile::Dup | ChunkProfile::Raid1 => 1,
462        ChunkProfile::Raid0 => u64::from(mapping.num_stripes),
463        ChunkProfile::Raid10 => {
464            let sub = u64::from(mapping.sub_stripes.max(1));
465            u64::from(mapping.num_stripes) / sub
466        }
467        ChunkProfile::Raid5 | ChunkProfile::Raid6 => unreachable!(),
468    };
469    debug_assert!(factor >= 1, "factor must be >= 1");
470
471    let mut placements: Vec<StripePlacement> = Vec::new();
472    let mut buf_offset: usize = 0;
473    let mut cur = logical - mapping.logical; // offset within chunk
474    let mut remaining = len;
475
476    // Row column buffer reused per row: RAID1C4 (4 mirrors) is the
477    // largest fan-out, so 4 entries is enough.
478    let mut cols: [u16; 4] = [0; 4];
479
480    while remaining > 0 {
481        let stripe_nr = cur / stripe_len;
482        let stripe_offset = cur % stripe_len;
483        // How many bytes of this row are in our request.
484        // `row_bytes` is bounded by `stripe_len` and `remaining` (a
485        // usize) so it always fits in usize.
486        let row_bytes =
487            usize::try_from((stripe_len - stripe_offset).min(remaining as u64))
488                .expect("row_bytes capped by remaining (usize)");
489
490        // Per-row column selection: which columns of mapping.stripes
491        // own this row's bytes.
492        let n_cols = fill_row_columns(profile, mapping, stripe_nr, &mut cols);
493        let cols_to_use = if read { &cols[..1] } else { &cols[..n_cols] };
494
495        // Per-device offset within the device.
496        let per_device_stripe_nr = stripe_nr / factor;
497        let per_device_offset =
498            per_device_stripe_nr * stripe_len + stripe_offset;
499
500        for &col in cols_to_use {
501            let stripe = &mapping.stripes[col as usize];
502            placements.push(StripePlacement {
503                devid: stripe.devid,
504                physical: stripe.offset + per_device_offset,
505                buf_offset,
506                len: row_bytes,
507            });
508        }
509
510        buf_offset += row_bytes;
511        cur += row_bytes as u64;
512        remaining -= row_bytes;
513    }
514
515    Some(placements)
516}
517
518/// Fill `cols` with the column indices of `mapping.stripes` that own
519/// the row at `stripe_nr` for the given profile, and return how many
520/// columns were written. `cols` must have room for at least
521/// `mapping.num_stripes` entries (4 is enough for the largest mirror
522/// fan-out: RAID1C4).
523fn fill_row_columns(
524    profile: ChunkProfile,
525    mapping: &ChunkMapping,
526    stripe_nr: u64,
527    cols: &mut [u16; 4],
528) -> usize {
529    // num_stripes / sub_stripes / column indices all fit in u16 (the
530    // profile fan-out is at most 4 — RAID1C4); the cast asserts here
531    // are documentation that they cannot truncate in practice.
532    match profile {
533        ChunkProfile::Single => {
534            cols[0] = 0;
535            1
536        }
537        ChunkProfile::Dup | ChunkProfile::Raid1 => {
538            let n = usize::from(mapping.num_stripes);
539            debug_assert!(n <= cols.len(), "mirror count {n} exceeds 4");
540            for (i, c) in cols.iter_mut().enumerate().take(n) {
541                *c =
542                    u16::try_from(i).expect("mirror count fits in u16 (max 4)");
543            }
544            n
545        }
546        ChunkProfile::Raid0 => {
547            let col_u64 = stripe_nr % u64::from(mapping.num_stripes);
548            cols[0] = u16::try_from(col_u64)
549                .expect("col bounded by num_stripes (u16)");
550            1
551        }
552        ChunkProfile::Raid10 => {
553            let sub = mapping.sub_stripes.max(1);
554            let factor = mapping.num_stripes / sub;
555            let group_u64 = stripe_nr % u64::from(factor);
556            let group = u16::try_from(group_u64)
557                .expect("group bounded by factor (u16)");
558            let base = group * sub;
559            let n = usize::from(sub);
560            for (s, c) in cols.iter_mut().enumerate().take(n) {
561                *c = base
562                    + u16::try_from(s)
563                        .expect("sub_stripes fits in u16 (max 4)");
564            }
565            n
566        }
567        ChunkProfile::Raid5 | ChunkProfile::Raid6 => {
568            // Caller must filter these out before calling.
569            unreachable!()
570        }
571    }
572}
573
/// Compute the rotating parity column(s) for a physical row of a
/// RAID5/RAID6 chunk.
///
/// Returns `(p_col, q_col)`. For RAID5, `q_col == p_col` (only one
/// parity column exists). For RAID6, `p_col` and `q_col` are the two
/// rotating parity slots.
///
/// The rotation matches btrfs's left-symmetric layout: at physical row
/// `r`, the rightmost parity column is `(num_stripes - 1 - r) mod
/// num_stripes`, and (for RAID6) the second parity column is one slot
/// to its left.
fn parity_columns(num_stripes: u64, nparity: u64, phys_row: u64) -> (u16, u16) {
    debug_assert!(num_stripes > nparity);
    let n = num_stripes;
    let rot = phys_row % n;
    // Rightmost parity slot, rotating left by one column per row.
    let q = (n - 1 + (n - rot)) % n;
    // RAID6's second parity slot sits one column to the left of `q`
    // (wrapping); RAID5 has only the one parity column.
    let p = if nparity == 1 { q } else { (q + n - 1) % n };
    (
        u16::try_from(p).expect("p_col bounded by num_stripes (u16)"),
        u16::try_from(q).expect("q_col bounded by num_stripes (u16)"),
    )
}
599
/// Walk `0..num_stripes`, skip the parity slot(s), and return the
/// physical column index of the `data_col_in_row`-th data slot.
///
/// # Panics
///
/// Panics if `data_col_in_row` names a data slot past the end of the
/// row.
fn nth_data_col(
    num_stripes: u16,
    nparity: u64,
    p_col: u16,
    q_col: u16,
    data_col_in_row: u64,
) -> u16 {
    // A column is a data slot unless it is P (or, for RAID6, Q).
    let found = usize::try_from(data_col_in_row).ok().and_then(|want| {
        (0..num_stripes)
            .filter(|&c| c != p_col && !(nparity == 2 && c == q_col))
            .nth(want)
    });
    match found {
        Some(c) => c,
        None => panic!("data_col_in_row {data_col_in_row} out of range"),
    }
}
621
622/// Build the per-data-column descriptors for one physical row of a
623/// RAID5/RAID6 plan.
624#[allow(clippy::too_many_arguments)]
625fn build_parity_data_columns(
626    mapping: &ChunkMapping,
627    phys_row: u64,
628    stripe_len: u64,
629    data_per_row: u64,
630    row_logical_start: u64,
631    row_a: u64,
632    row_b: u64,
633    row_buf_base: usize,
634    (p_col, q_col): (u16, u16),
635    nparity: u64,
636) -> Vec<ParityDataColumn> {
637    let mut data_columns =
638        Vec::with_capacity(usize::try_from(data_per_row).unwrap_or(0));
639    for data_idx in 0..data_per_row {
640        let phys_col =
641            nth_data_col(mapping.num_stripes, nparity, p_col, q_col, data_idx);
642        let stripe = &mapping.stripes[phys_col as usize];
643        let physical = stripe.offset + phys_row * stripe_len;
644        let slot_logical_start = row_logical_start + data_idx * stripe_len;
645        let slot_logical_end = slot_logical_start + stripe_len;
646        let lo = row_a.max(slot_logical_start);
647        let hi = row_b.min(slot_logical_end);
648        let overlay = (lo < hi).then(|| {
649            let slot_offset = u32::try_from(lo - slot_logical_start)
650                .expect("slot_offset < stripe_len (u32)");
651            let len_bytes =
652                u32::try_from(hi - lo).expect("overlay len < stripe_len (u32)");
653            let buf_offset = row_buf_base
654                + usize::try_from(lo - row_a)
655                    .expect("overlay buf_offset capped by len");
656            CallerOverlay {
657                slot_offset,
658                buf_offset,
659                len: len_bytes,
660            }
661        });
662        data_columns.push(ParityDataColumn {
663            devid: stripe.devid,
664            physical,
665            overlay,
666        });
667    }
668    data_columns
669}
670
671/// Build the parity column targets (1 for RAID5, 2 for RAID6) for one
672/// physical row.
673fn build_parity_targets(
674    mapping: &ChunkMapping,
675    phys_row: u64,
676    stripe_len: u64,
677    p_col: u16,
678    q_col: u16,
679    nparity: u64,
680) -> Vec<ParityTarget> {
681    let p_stripe = &mapping.stripes[p_col as usize];
682    let mut targets = vec![ParityTarget {
683        kind: ParityKind::P,
684        devid: p_stripe.devid,
685        physical: p_stripe.offset + phys_row * stripe_len,
686    }];
687    if nparity == 2 {
688        let q_stripe = &mapping.stripes[q_col as usize];
689        targets.push(ParityTarget {
690            kind: ParityKind::Q,
691            devid: q_stripe.devid,
692            physical: q_stripe.offset + phys_row * stripe_len,
693        });
694    }
695    targets
696}
697
698/// Plan a write to a RAID5 or RAID6 chunk.
699///
700/// Walks the physical rows touched by the request, and for each row
701/// builds:
702/// - one [`ParityDataColumn`] per data column, with the optional
703///   caller-byte overlay describing what (if anything) the caller is
704///   writing into that column slot;
705/// - one [`ParityTarget`] per parity column (1 for RAID5, 2 for
706///   RAID6).
707///
708/// The executor must preread every data column slot of every touched
709/// row to compute parity (since even single-tree-block writes only
710/// cover a fraction of one data column slot, the rest of the row is
711/// untouched but still feeds parity).
712fn plan_parity_write(
713    mapping: &ChunkMapping,
714    logical: u64,
715    len: usize,
716) -> Option<ParityPlan> {
717    let nparity: u64 = match mapping.profile() {
718        ChunkProfile::Raid5 => 1,
719        ChunkProfile::Raid6 => 2,
720        _ => unreachable!("plan_parity_write called for non-RAID5/6 profile"),
721    };
722    let n = u64::from(mapping.num_stripes);
723    let stripe_len = mapping.stripe_len;
724    debug_assert!(stripe_len > 0, "chunk stripe_len must be non-zero");
725    debug_assert!(n > nparity, "RAID5/6 needs more stripes than parity");
726    let stripe_len_u32 = u32::try_from(stripe_len).ok()?;
727
728    if len == 0 {
729        return Some(ParityPlan {
730            stripe_len: stripe_len_u32,
731            rows: Vec::new(),
732        });
733    }
734
735    let end = logical.checked_add(len as u64)?;
736    if end > mapping.logical.checked_add(mapping.length)? {
737        return None;
738    }
739
740    let data_per_row = n - nparity;
741    let logical_per_phys_row = data_per_row * stripe_len;
742    let chunk_off_start = logical - mapping.logical;
743    let chunk_off_end = end - mapping.logical;
744
745    let phys_row_start = chunk_off_start / logical_per_phys_row;
746    let phys_row_end = (chunk_off_end - 1) / logical_per_phys_row;
747
748    let mut rows = Vec::with_capacity(
749        usize::try_from(phys_row_end - phys_row_start + 1)
750            .expect("phys_row count fits in usize"),
751    );
752
753    for phys_row in phys_row_start..=phys_row_end {
754        let row_logical_start = phys_row * logical_per_phys_row;
755        let row_logical_end = row_logical_start + logical_per_phys_row;
756        let row_a = chunk_off_start.max(row_logical_start);
757        let row_b = chunk_off_end.min(row_logical_end);
758        debug_assert!(row_a < row_b, "non-empty row coverage");
759        let row_buf_base = usize::try_from(row_a - chunk_off_start)
760            .expect("row_buf_base capped by len (usize)");
761        let (p_col, q_col) = parity_columns(n, nparity, phys_row);
762        let data_columns = build_parity_data_columns(
763            mapping,
764            phys_row,
765            stripe_len,
766            data_per_row,
767            row_logical_start,
768            row_a,
769            row_b,
770            row_buf_base,
771            (p_col, q_col),
772            nparity,
773        );
774        let parity_targets = build_parity_targets(
775            mapping, phys_row, stripe_len, p_col, q_col, nparity,
776        );
777        rows.push(ParityRow {
778            data_columns,
779            parity_targets,
780        });
781    }
782
783    Some(ParityPlan {
784        stripe_len: stripe_len_u32,
785        rows,
786    })
787}
788
789/// Plan a read from a RAID5 or RAID6 chunk.
790///
791/// Same shape as [`plan_io`] for RAID0: one placement per row, picking
792/// the data column that owns the row's bytes (skipping parity
793/// columns). Degraded reads (data column missing -> reconstruct from
794/// parity) are out of scope.
795fn plan_parity_read(
796    mapping: &ChunkMapping,
797    logical: u64,
798    len: usize,
799) -> Option<Vec<StripePlacement>> {
800    let nparity: u64 = match mapping.profile() {
801        ChunkProfile::Raid5 => 1,
802        ChunkProfile::Raid6 => 2,
803        _ => unreachable!("plan_parity_read called for non-RAID5/6 profile"),
804    };
805    let n = u64::from(mapping.num_stripes);
806    let stripe_len = mapping.stripe_len;
807    debug_assert!(stripe_len > 0, "chunk stripe_len must be non-zero");
808    debug_assert!(n > nparity, "RAID5/6 needs more stripes than parity");
809
810    if len == 0 {
811        return Some(Vec::new());
812    }
813
814    let end = logical.checked_add(len as u64)?;
815    if end > mapping.logical.checked_add(mapping.length)? {
816        return None;
817    }
818
819    let data_per_row = n - nparity;
820    let mut placements = Vec::new();
821    let mut buf_offset: usize = 0;
822    let mut cur = logical - mapping.logical;
823    let mut remaining = len;
824
825    while remaining > 0 {
826        let stripe_nr = cur / stripe_len;
827        let stripe_offset = cur % stripe_len;
828        let row_bytes =
829            usize::try_from((stripe_len - stripe_offset).min(remaining as u64))
830                .expect("row_bytes capped by remaining (usize)");
831
832        let phys_row = stripe_nr / data_per_row;
833        let data_col_in_row = stripe_nr % data_per_row;
834        let (p_col, q_col) = parity_columns(n, nparity, phys_row);
835        let phys_col = nth_data_col(
836            mapping.num_stripes,
837            nparity,
838            p_col,
839            q_col,
840            data_col_in_row,
841        );
842
843        let stripe = &mapping.stripes[phys_col as usize];
844        let per_device_offset = phys_row * stripe_len + stripe_offset;
845        placements.push(StripePlacement {
846            devid: stripe.devid,
847            physical: stripe.offset + per_device_offset,
848            buf_offset,
849            len: row_bytes,
850        });
851
852        buf_offset += row_bytes;
853        cur += row_bytes as u64;
854        remaining -= row_bytes;
855    }
856
857    Some(placements)
858}
859
860/// Parse a chunk item (`btrfs_chunk` + stripes) from a raw byte buffer.
861///
862/// Returns the chunk mapping and the total number of bytes consumed.
863/// `logical` is the logical start address from the key's offset field.
864#[must_use]
865pub fn parse_chunk_item(
866    buf: &[u8],
867    logical: u64,
868) -> Option<(ChunkMapping, usize)> {
869    let chunk_base_size = mem::offset_of!(raw::btrfs_chunk, stripe);
870    let stripe_size = mem::size_of::<raw::btrfs_stripe>();
871
872    if buf.len() < chunk_base_size {
873        return None;
874    }
875
876    let mut b = buf;
877    let length = b.get_u64_le();
878    b.advance(8); // owner
879    let stripe_len = b.get_u64_le();
880    let chunk_type = b.get_u64_le();
881    b.advance(12); // io_align(4) + io_width(4) + sector_size(4)
882    let num_stripes = b.get_u16_le();
883    let sub_stripes = b.get_u16_le();
884
885    let total_size = chunk_base_size + num_stripes as usize * stripe_size;
886    if buf.len() < total_size {
887        return None;
888    }
889
890    let mut stripes = Vec::with_capacity(num_stripes as usize);
891    let mut b = &buf[chunk_base_size..];
892    for _ in 0..num_stripes as usize {
893        let devid = b.get_u64_le();
894        let offset = b.get_u64_le();
895        let dev_uuid = get_uuid(&mut b);
896        stripes.push(Stripe {
897            devid,
898            offset,
899            dev_uuid,
900        });
901    }
902
903    let mapping = ChunkMapping {
904        logical,
905        length,
906        stripe_len,
907        chunk_type,
908        num_stripes,
909        sub_stripes,
910        stripes,
911    };
912
913    Some((mapping, total_size))
914}
915
916/// Seed a `ChunkTreeCache` from the superblock's `sys_chunk_array`.
917///
918/// The `sys_chunk_array` contains a subset of chunk items needed to bootstrap
919/// access to the full chunk tree (system profile chunks).
920#[must_use]
921pub fn seed_from_sys_chunk_array(array: &[u8], size: u32) -> ChunkTreeCache {
922    let array = &array[..size as usize];
923    let mut cache = ChunkTreeCache::default();
924
925    let disk_key_size = mem::size_of::<raw::btrfs_disk_key>();
926    let mut offset = 0usize;
927
928    while offset + disk_key_size <= array.len() {
929        let mut b = &array[offset + 9..];
930        let key_offset = b.get_u64_le();
931        offset += disk_key_size;
932
933        if let Some((mapping, consumed)) =
934            parse_chunk_item(&array[offset..], key_offset)
935        {
936            cache.insert(mapping);
937            offset += consumed;
938        } else {
939            break;
940        }
941    }
942
943    cache
944}
945
946/// Serialize a [`ChunkMapping`] into the on-disk `btrfs_chunk` byte
947/// layout (48-byte fixed header + `num_stripes * 32`-byte stripes).
948///
949/// `sector_size` is written into the chunk's `io_align`, `io_width`,
950/// and `sector_size` fields (all three are conventionally equal to the
951/// filesystem sector size).
952#[must_use]
953pub fn chunk_item_bytes(mapping: &ChunkMapping, sector_size: u32) -> Vec<u8> {
954    use bytes::BufMut;
955
956    let chunk_base_size = mem::offset_of!(raw::btrfs_chunk, stripe);
957    let stripe_size = mem::size_of::<raw::btrfs_stripe>();
958    let total = chunk_base_size + mapping.num_stripes as usize * stripe_size;
959    let mut buf: Vec<u8> = Vec::with_capacity(total);
960
961    buf.put_u64_le(mapping.length);
962    // Owner is always EXTENT_TREE objectid (2) for chunk records.
963    buf.put_u64_le(u64::from(raw::BTRFS_EXTENT_TREE_OBJECTID));
964    buf.put_u64_le(mapping.stripe_len);
965    buf.put_u64_le(mapping.chunk_type);
966    buf.put_u32_le(sector_size); // io_align
967    buf.put_u32_le(sector_size); // io_width
968    buf.put_u32_le(sector_size); // sector_size
969    buf.put_u16_le(mapping.num_stripes);
970    buf.put_u16_le(mapping.sub_stripes);
971    debug_assert_eq!(buf.len(), chunk_base_size);
972
973    for stripe in &mapping.stripes {
974        buf.put_u64_le(stripe.devid);
975        buf.put_u64_le(stripe.offset);
976        buf.extend_from_slice(stripe.dev_uuid.as_bytes());
977    }
978    debug_assert_eq!(buf.len(), total);
979    buf
980}
981
982/// Walk the superblock's `sys_chunk_array` and return `true` if it
983/// already contains a record whose `disk_key.offset` matches `bg_start`
984/// (i.e. the system chunk starting at that logical address is already
985/// part of the bootstrap snippet).
986#[must_use]
987pub fn sys_chunk_array_contains(
988    array: &[u8],
989    size: u32,
990    bg_start: u64,
991) -> bool {
992    let array = &array[..size as usize];
993    let disk_key_size = mem::size_of::<raw::btrfs_disk_key>();
994    let mut offset = 0usize;
995    while offset + disk_key_size <= array.len() {
996        // disk_key layout: u64 objectid | u8 type | u64 offset.
997        let mut b = &array[offset + 9..];
998        let key_offset = b.get_u64_le();
999        offset += disk_key_size;
1000        if key_offset == bg_start {
1001            return true;
1002        }
1003        let Some((_, consumed)) =
1004            parse_chunk_item(&array[offset..], key_offset)
1005        else {
1006            return false;
1007        };
1008        offset += consumed;
1009    }
1010    false
1011}
1012
1013/// Append a single `(disk_key, btrfs_chunk)` record to the
1014/// superblock's `sys_chunk_array` byte buffer.
1015///
1016/// Writes the 17-byte `btrfs_disk_key` followed by `chunk_bytes`
1017/// (already serialized via [`chunk_item_bytes`]) starting at offset
1018/// `*size`. On success, `*size` is bumped by the record size and
1019/// `Ok(new_size)` is returned. Returns `Err` if the record would
1020/// overflow the 2048-byte `sys_chunk_array`.
1021///
1022/// `bg_start` is the chunk's logical start address; it becomes the
1023/// `offset` field of the `BTRFS_FIRST_CHUNK_TREE_OBJECTID / CHUNK_ITEM`
1024/// disk key.
1025///
1026/// # Errors
1027///
1028/// Returns an error if the record does not fit in the remaining
1029/// `sys_chunk_array` space.
1030///
1031/// # Panics
1032///
1033/// Panics in debug builds if the new array size cannot be represented
1034/// in a `u32`. In practice this never happens because callers cap the
1035/// buffer at 2048 bytes.
1036pub fn sys_chunk_array_append(
1037    array: &mut [u8],
1038    size: &mut u32,
1039    bg_start: u64,
1040    chunk_bytes: &[u8],
1041) -> Result<u32, &'static str> {
1042    use bytes::BufMut;
1043
1044    let disk_key_size = mem::size_of::<raw::btrfs_disk_key>();
1045    let record_size = disk_key_size + chunk_bytes.len();
1046    let cur = *size as usize;
1047    if cur + record_size > array.len() {
1048        return Err("sys_chunk_array overflow");
1049    }
1050
1051    // disk_key: objectid=BTRFS_FIRST_CHUNK_TREE_OBJECTID(256),
1052    //           type=BTRFS_CHUNK_ITEM_KEY(228),
1053    //           offset=bg_start.
1054    #[allow(clippy::cast_possible_truncation)]
1055    let chunk_item_type = raw::BTRFS_CHUNK_ITEM_KEY as u8;
1056    let mut header = [0u8; 17];
1057    {
1058        let mut w = &mut header[..];
1059        w.put_u64_le(u64::from(raw::BTRFS_FIRST_CHUNK_TREE_OBJECTID));
1060        w.put_u8(chunk_item_type);
1061        w.put_u64_le(bg_start);
1062    }
1063    array[cur..cur + 17].copy_from_slice(&header);
1064    array[cur + 17..cur + record_size].copy_from_slice(chunk_bytes);
1065
1066    let new_size = u32::try_from(cur + record_size).unwrap();
1067    *size = new_size;
1068    Ok(new_size)
1069}
1070
1071#[cfg(test)]
1072mod tests {
1073    use super::*;
1074
1075    fn make_mapping(logical: u64, length: u64, physical: u64) -> ChunkMapping {
1076        ChunkMapping {
1077            logical,
1078            length,
1079            stripe_len: 65536,
1080            chunk_type: 0,
1081            num_stripes: 1,
1082            sub_stripes: 0,
1083            stripes: vec![Stripe {
1084                devid: 1,
1085                offset: physical,
1086                dev_uuid: Uuid::nil(),
1087            }],
1088        }
1089    }
1090
1091    /// Build a chunk mapping with arbitrary stripes. Each entry is
1092    /// `(devid, physical_offset)`.
1093    fn make_multi_stripe_mapping(
1094        logical: u64,
1095        length: u64,
1096        stripes: &[(u64, u64)],
1097    ) -> ChunkMapping {
1098        ChunkMapping {
1099            logical,
1100            length,
1101            stripe_len: 65536,
1102            chunk_type: 0,
1103            num_stripes: stripes.len() as u16,
1104            sub_stripes: 0,
1105            stripes: stripes
1106                .iter()
1107                .map(|&(devid, offset)| Stripe {
1108                    devid,
1109                    offset,
1110                    dev_uuid: Uuid::nil(),
1111                })
1112                .collect(),
1113        }
1114    }
1115
    // An empty cache resolves nothing.
    #[test]
    fn empty_cache() {
        let cache = ChunkTreeCache::default();
        assert!(cache.is_empty());
        assert_eq!(cache.resolve(0), None);
    }

    // One chunk: logical offsets translate 1:1 into the stripe's
    // physical range; addresses outside the chunk miss.
    #[test]
    fn single_mapping() {
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_mapping(1000, 500, 2000));
        assert_eq!(cache.len(), 1);

        assert_eq!(cache.resolve(1000), Some((1, 2000)));
        assert_eq!(cache.resolve(1100), Some((1, 2100)));
        assert_eq!(cache.resolve(1499), Some((1, 2499)));
        assert_eq!(cache.resolve(1500), None); // past end
        assert_eq!(cache.resolve(999), None); // before start
    }

    // Several chunks, including a logical gap between 2000 and 5000.
    #[test]
    fn multiple_mappings() {
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_mapping(0, 1000, 5000));
        cache.insert(make_mapping(1000, 1000, 6000));
        cache.insert(make_mapping(5000, 2000, 10000));

        assert_eq!(cache.resolve(0), Some((1, 5000)));
        assert_eq!(cache.resolve(500), Some((1, 5500)));
        assert_eq!(cache.resolve(1000), Some((1, 6000)));
        assert_eq!(cache.resolve(1999), Some((1, 6999)));
        assert_eq!(cache.resolve(2000), None); // gap
        assert_eq!(cache.resolve(5000), Some((1, 10000)));
        assert_eq!(cache.resolve(6999), Some((1, 11999)));
        assert_eq!(cache.resolve(7000), None);
    }
1152
    // `lookup` hands back the covering mapping itself (not a
    // translated address); misses outside the chunk.
    #[test]
    fn lookup_returns_mapping() {
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_mapping(1000, 500, 2000));

        let m = cache.lookup(1100).unwrap();
        assert_eq!(m.logical, 1000);
        assert_eq!(m.length, 500);
        assert!(cache.lookup(500).is_none());
    }

    #[test]
    fn resolve_returns_first_stripe_only() {
        // For a multi-stripe mapping, the single-result `resolve` always
        // picks stripe[0]'s (devid, physical). Useful for read paths
        // where any mirror is fine; write paths use `resolve_all`.
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_multi_stripe_mapping(
            1000,
            500,
            &[(1, 2000), (2, 9000)],
        ));
        assert_eq!(cache.resolve(1000), Some((1, 2000)));
        assert_eq!(cache.resolve(1100), Some((1, 2100)));
        assert_eq!(cache.resolve(1499), Some((1, 2499)));
    }
1179
    #[test]
    fn resolve_all_dup_returns_two_offsets_same_devid() {
        // DUP profile: 2 stripes, both on devid 1, at distinct physical
        // offsets. The on-device write path writes to both copies via
        // the same handle, but the cache must report both placements.
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_multi_stripe_mapping(
            1000,
            500,
            &[(1, 2000), (1, 50000)],
        ));
        assert_eq!(cache.resolve_all(1000), Some(vec![(1, 2000), (1, 50000)]),);
        // Within-chunk offset propagates to every stripe's physical.
        assert_eq!(cache.resolve_all(1100), Some(vec![(1, 2100), (1, 50100)]),);
    }

    #[test]
    fn resolve_all_raid1_returns_two_devids() {
        // RAID1: stripes on different devices. `write_block` must
        // route each placement to its own device handle.
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_multi_stripe_mapping(
            1000,
            500,
            &[(1, 2000), (2, 9000)],
        ));
        assert_eq!(cache.resolve_all(1000), Some(vec![(1, 2000), (2, 9000)]),);
        assert_eq!(cache.resolve_all(1250), Some(vec![(1, 2250), (2, 9250)]),);
    }

    // Three-copy mirroring: every stripe receives the same within-chunk
    // offset (here +100).
    #[test]
    fn resolve_all_raid1c3_returns_three_stripes() {
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_multi_stripe_mapping(
            1000,
            500,
            &[(1, 2000), (2, 9000), (3, 0x10_0000)],
        ));
        let placements = cache.resolve_all(1100).unwrap();
        assert_eq!(placements.len(), 3);
        assert_eq!(placements[0], (1, 2100));
        assert_eq!(placements[1], (2, 9100));
        assert_eq!(placements[2], (3, 0x10_0000 + 100));
    }

    // Four-copy mirroring behaves the same as three-copy.
    #[test]
    fn resolve_all_raid1c4_returns_four_stripes() {
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_multi_stripe_mapping(
            1000,
            500,
            &[(1, 2000), (2, 9000), (3, 0x10_0000), (4, 0x20_0000)],
        ));
        let placements = cache.resolve_all(1100).unwrap();
        assert_eq!(placements.len(), 4);
        assert_eq!(placements[0], (1, 2100));
        assert_eq!(placements[3], (4, 0x20_0000 + 100));
    }

    // Addresses just before/after the chunk range must miss.
    #[test]
    fn resolve_all_returns_none_outside_chunks() {
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_multi_stripe_mapping(
            1000,
            500,
            &[(1, 2000), (2, 9000)],
        ));
        assert_eq!(cache.resolve_all(999), None);
        assert_eq!(cache.resolve_all(1500), None);
    }
1250
    #[test]
    fn resolve_all_with_non_dense_devids() {
        // btrfs allows device removal that leaves devid gaps. Stripes
        // on devids {1, 5} should resolve correctly.
        let mut cache = ChunkTreeCache::default();
        cache.insert(make_multi_stripe_mapping(
            0,
            1000,
            &[(1, 100), (5, 999_000)],
        ));
        assert_eq!(cache.resolve_all(42), Some(vec![(1, 142), (5, 999_042)]),);
    }

    // size == 0 means no records, regardless of the buffer contents.
    #[test]
    fn seed_from_empty_array() {
        let array = [0u8; 2048];
        let cache = seed_from_sys_chunk_array(&array, 0);
        assert!(cache.is_empty());
    }
1270
    // Hand-build a minimal one-stripe chunk item at the raw byte level
    // and check the parser recovers every field.
    #[test]
    fn parse_chunk_item_basic() {
        let chunk_base_size = mem::offset_of!(raw::btrfs_chunk, stripe);
        let stripe_size = mem::size_of::<raw::btrfs_stripe>();
        let total = chunk_base_size + stripe_size;
        let mut buf = vec![0u8; total];

        // length
        buf[0..8].copy_from_slice(&1000u64.to_le_bytes());
        // owner
        buf[8..16].copy_from_slice(&2u64.to_le_bytes());
        // stripe_len
        buf[16..24].copy_from_slice(&65536u64.to_le_bytes());
        // type
        buf[24..32].copy_from_slice(&1u64.to_le_bytes());
        // bytes 32..44 (io_align/io_width/sector_size) stay zero.
        // num_stripes
        buf[44..46].copy_from_slice(&1u16.to_le_bytes());

        // stripe: devid=1, offset=5000
        buf[chunk_base_size..chunk_base_size + 8]
            .copy_from_slice(&1u64.to_le_bytes());
        buf[chunk_base_size + 8..chunk_base_size + 16]
            .copy_from_slice(&5000u64.to_le_bytes());

        let (mapping, consumed) = parse_chunk_item(&buf, 0).unwrap();
        assert_eq!(consumed, total);
        assert_eq!(mapping.logical, 0);
        assert_eq!(mapping.length, 1000);
        assert_eq!(mapping.num_stripes, 1);
        assert_eq!(mapping.stripes[0].devid, 1);
        assert_eq!(mapping.stripes[0].offset, 5000);
    }
1303
1304    fn sample_mapping(
1305        logical: u64,
1306        length: u64,
1307        physical: u64,
1308    ) -> ChunkMapping {
1309        ChunkMapping {
1310            logical,
1311            length,
1312            stripe_len: 65536,
1313            // SYSTEM | SINGLE
1314            chunk_type: u64::from(raw::BTRFS_BLOCK_GROUP_SYSTEM),
1315            num_stripes: 1,
1316            sub_stripes: 1,
1317            stripes: vec![Stripe {
1318                devid: 1,
1319                offset: physical,
1320                dev_uuid: Uuid::from_bytes([0xAB; 16]),
1321            }],
1322        }
1323    }
1324
    // Serializer output must be accepted unchanged by the parser, with
    // every field surviving the round trip.
    #[test]
    fn chunk_item_bytes_round_trips_via_parser() {
        let m = sample_mapping(0x100_0000, 0x40_0000, 0x200_0000);
        let bytes = chunk_item_bytes(&m, 4096);
        let (parsed, consumed) = parse_chunk_item(&bytes, m.logical).unwrap();
        assert_eq!(consumed, bytes.len());
        assert_eq!(parsed.logical, m.logical);
        assert_eq!(parsed.length, m.length);
        assert_eq!(parsed.stripe_len, m.stripe_len);
        assert_eq!(parsed.chunk_type, m.chunk_type);
        assert_eq!(parsed.num_stripes, 1);
        assert_eq!(parsed.sub_stripes, 1);
        assert_eq!(parsed.stripes[0].devid, 1);
        assert_eq!(parsed.stripes[0].offset, 0x200_0000);
        assert_eq!(parsed.stripes[0].dev_uuid, m.stripes[0].dev_uuid);
    }

    // Append two records, checking membership after each, then seed a
    // cache from the resulting array.
    #[test]
    fn sys_chunk_array_append_then_contains_and_seed() {
        let mut buf = [0u8; 2048];
        let mut size: u32 = 0;
        let m1 = sample_mapping(0x100_0000, 0x40_0000, 0x200_0000);
        let m2 = sample_mapping(0x500_0000, 0x40_0000, 0x600_0000);

        let bytes1 = chunk_item_bytes(&m1, 4096);
        sys_chunk_array_append(&mut buf, &mut size, m1.logical, &bytes1)
            .unwrap();
        assert!(sys_chunk_array_contains(&buf, size, m1.logical));
        assert!(!sys_chunk_array_contains(&buf, size, m2.logical));

        let bytes2 = chunk_item_bytes(&m2, 4096);
        sys_chunk_array_append(&mut buf, &mut size, m2.logical, &bytes2)
            .unwrap();
        assert!(sys_chunk_array_contains(&buf, size, m2.logical));

        // Seeding from the array should yield both mappings.
        let cache = seed_from_sys_chunk_array(&buf, size);
        assert_eq!(cache.len(), 2);
        assert!(cache.lookup(m1.logical).is_some());
        assert!(cache.lookup(m2.logical).is_some());
    }

    #[test]
    fn sys_chunk_array_append_overflow() {
        // Tiny array that fits exactly one record (97 bytes for a
        // single-stripe chunk).
        let mut buf = [0u8; 100];
        let mut size: u32 = 0;
        let m = sample_mapping(0, 0x40_0000, 0);
        let bytes = chunk_item_bytes(&m, 4096);
        sys_chunk_array_append(&mut buf, &mut size, m.logical, &bytes).unwrap();
        // Second append must fail.
        let m2 = sample_mapping(0x100_0000, 0x40_0000, 0x200_0000);
        let bytes2 = chunk_item_bytes(&m2, 4096);
        assert!(
            sys_chunk_array_append(&mut buf, &mut size, m2.logical, &bytes2)
                .is_err()
        );
    }

    // A buffer shorter than the fixed chunk header must be rejected.
    #[test]
    fn parse_chunk_item_too_short() {
        let buf = [0u8; 10];
        assert!(parse_chunk_item(&buf, 0).is_none());
    }
1390
    // --- ChunkProfile decoding ---

    #[test]
    fn profile_from_chunk_type_basic() {
        use ChunkProfile::*;
        // SINGLE: no profile bit set (only DATA/SYSTEM/METADATA bits).
        assert_eq!(
            ChunkProfile::from_chunk_type(u64::from(
                raw::BTRFS_BLOCK_GROUP_DATA
            )),
            Single
        );
        // DUP, RAID0, RAID1, RAID10, RAID5, RAID6.
        let cases = [
            (raw::BTRFS_BLOCK_GROUP_DUP, Dup),
            (raw::BTRFS_BLOCK_GROUP_RAID0, Raid0),
            (raw::BTRFS_BLOCK_GROUP_RAID1, Raid1),
            (raw::BTRFS_BLOCK_GROUP_RAID10, Raid10),
            (raw::BTRFS_BLOCK_GROUP_RAID5, Raid5),
            (raw::BTRFS_BLOCK_GROUP_RAID6, Raid6),
        ];
        for (bit, expected) in cases {
            // The DATA bit must not affect profile decoding.
            let ct = u64::from(bit) | u64::from(raw::BTRFS_BLOCK_GROUP_DATA);
            assert_eq!(ChunkProfile::from_chunk_type(ct), expected);
        }
    }

    // RAID1C3/C4 collapse into the Raid1 variant (same routing math:
    // every stripe gets the same bytes).
    #[test]
    fn profile_from_chunk_type_raid1c3_and_c4() {
        let c3 = u64::from(raw::BTRFS_BLOCK_GROUP_RAID1C3);
        let c4 = u64::from(raw::BTRFS_BLOCK_GROUP_RAID1C4);
        assert_eq!(ChunkProfile::from_chunk_type(c3), ChunkProfile::Raid1);
        assert_eq!(ChunkProfile::from_chunk_type(c4), ChunkProfile::Raid1);
    }
1425
1426    // --- plan_write / plan_read shared helpers ---
1427
1428    /// Build a chunk mapping at logical 0 with the given profile and
1429    /// stripe layout.
1430    fn make_chunk(
1431        chunk_type_bit: u32,
1432        num_stripes: u16,
1433        sub_stripes: u16,
1434        stripe_len: u64,
1435        length: u64,
1436        stripes: &[(u64, u64)],
1437    ) -> ChunkMapping {
1438        ChunkMapping {
1439            logical: 0,
1440            length,
1441            stripe_len,
1442            chunk_type: u64::from(chunk_type_bit)
1443                | u64::from(raw::BTRFS_BLOCK_GROUP_DATA),
1444            num_stripes,
1445            sub_stripes,
1446            stripes: stripes
1447                .iter()
1448                .map(|&(devid, offset)| Stripe {
1449                    devid,
1450                    offset,
1451                    dev_uuid: Uuid::nil(),
1452                })
1453                .collect(),
1454        }
1455    }
1456
1457    fn cache_with(mapping: ChunkMapping) -> ChunkTreeCache {
1458        let mut c = ChunkTreeCache::default();
1459        c.insert(mapping);
1460        c
1461    }
1462
    // --- SINGLE ---

    // A write fully inside one stripe yields a single placement.
    #[test]
    fn plan_write_single_one_row() {
        let m = make_chunk(0, 1, 1, 65536, 1 << 20, &[(1, 0x1000)]);
        let cache = cache_with(m);
        let placements = cache.plan_write(0, 4096).unwrap().unwrap_plain();
        assert_eq!(
            placements,
            vec![StripePlacement {
                devid: 1,
                physical: 0x1000,
                buf_offset: 0,
                len: 4096,
            }]
        );
    }

    #[test]
    fn plan_write_single_spans_multiple_rows() {
        // SINGLE with a 1 MiB chunk and stripe_len 64 KiB. A 96 KiB
        // write starting at offset 32 KiB spans two rows. Both go to
        // the same column, but the per-device offset advances as the
        // row index increases.
        let m = make_chunk(0, 1, 1, 65536, 1 << 20, &[(1, 0x10000)]);
        let cache = cache_with(m);
        let placements = cache
            .plan_write(32 * 1024, 96 * 1024)
            .unwrap()
            .unwrap_plain();
        assert_eq!(placements.len(), 2);
        // Row 0: offset 32K, 32K bytes (rest of stripe 0).
        assert_eq!(placements[0].devid, 1);
        assert_eq!(placements[0].physical, 0x10000 + 32 * 1024);
        assert_eq!(placements[0].buf_offset, 0);
        assert_eq!(placements[0].len, 32 * 1024);
        // Row 1: 64K bytes, starting at the next stripe (stripe_nr=1, factor=1
        // so per_device_stripe_nr=1; physical = base + 64K).
        assert_eq!(placements[1].devid, 1);
        assert_eq!(placements[1].physical, 0x10000 + 65536);
        assert_eq!(placements[1].buf_offset, 32 * 1024);
        assert_eq!(placements[1].len, 64 * 1024);
    }
1506
    // --- DUP / RAID1 mirroring ---

    #[test]
    fn plan_write_dup_writes_both_copies_same_buf_slice() {
        // DUP: 2 stripes both on devid 1 at distinct physicals. Same
        // buf_offset/len for each placement (identical bytes).
        let m = make_chunk(
            raw::BTRFS_BLOCK_GROUP_DUP,
            2,
            1,
            65536,
            1 << 20,
            &[(1, 0x1000), (1, 0x2_0000)],
        );
        let cache = cache_with(m);
        let placements = cache.plan_write(4096, 16384).unwrap().unwrap_plain();
        assert_eq!(placements.len(), 2);
        assert_eq!(placements[0].devid, 1);
        assert_eq!(placements[0].physical, 0x1000 + 4096);
        assert_eq!(placements[0].buf_offset, 0);
        assert_eq!(placements[0].len, 16384);
        assert_eq!(placements[1].devid, 1);
        assert_eq!(placements[1].physical, 0x2_0000 + 4096);
        assert_eq!(placements[1].buf_offset, 0);
        assert_eq!(placements[1].len, 16384);
    }

    // RAID1: one placement per mirror device, same buffer slice.
    #[test]
    fn plan_write_raid1_writes_all_mirrors() {
        let m = make_chunk(
            raw::BTRFS_BLOCK_GROUP_RAID1,
            2,
            1,
            65536,
            1 << 20,
            &[(1, 0x1000), (2, 0x2000)],
        );
        let cache = cache_with(m);
        let placements = cache.plan_write(0, 8192).unwrap().unwrap_plain();
        assert_eq!(placements.len(), 2);
        assert_eq!(placements[0].devid, 1);
        assert_eq!(placements[1].devid, 2);
        for p in &placements {
            assert_eq!(p.buf_offset, 0);
            assert_eq!(p.len, 8192);
        }
    }

    // RAID1C3: three mirrors, three placements.
    #[test]
    fn plan_write_raid1c3_writes_three_mirrors() {
        let m = make_chunk(
            raw::BTRFS_BLOCK_GROUP_RAID1C3,
            3,
            1,
            65536,
            1 << 20,
            &[(1, 0x1000), (2, 0x2000), (3, 0x3000)],
        );
        let cache = cache_with(m);
        let placements = cache.plan_write(0, 8192).unwrap().unwrap_plain();
        assert_eq!(placements.len(), 3);
        assert_eq!(placements[0].devid, 1);
        assert_eq!(placements[1].devid, 2);
        assert_eq!(placements[2].devid, 3);
    }

    // RAID1C4: four mirrors, four placements.
    #[test]
    fn plan_write_raid1c4_writes_four_mirrors() {
        let m = make_chunk(
            raw::BTRFS_BLOCK_GROUP_RAID1C4,
            4,
            1,
            65536,
            1 << 20,
            &[(1, 0x1000), (2, 0x2000), (3, 0x3000), (4, 0x4000)],
        );
        let cache = cache_with(m);
        let placements = cache.plan_write(0, 8192).unwrap().unwrap_plain();
        assert_eq!(placements.len(), 4);
        assert_eq!(placements[3].devid, 4);
    }
1588
    // --- RAID0 striping ---

    #[test]
    fn plan_write_raid0_routes_first_row_to_column_zero() {
        // 2 devices, stripe_len=64K. Write at logical 0 of length 4K
        // goes to stripe[0].
        let m = make_chunk(
            raw::BTRFS_BLOCK_GROUP_RAID0,
            2,
            1,
            65536,
            2 << 20,
            &[(1, 0x10000), (2, 0x20000)],
        );
        let cache = cache_with(m);
        let placements = cache.plan_write(0, 4096).unwrap().unwrap_plain();
        assert_eq!(placements.len(), 1);
        assert_eq!(placements[0].devid, 1);
        assert_eq!(placements[0].physical, 0x10000);
        assert_eq!(placements[0].len, 4096);
    }

    #[test]
    fn plan_write_raid0_second_row_routes_to_second_device() {
        // Same chunk, write at logical = STRIPE_LEN (row 1) goes to stripe[1].
        // The per-device stripe number is row 1 / factor(2) = 0; physical
        // = base + 0 = base.
        let m = make_chunk(
            raw::BTRFS_BLOCK_GROUP_RAID0,
            2,
            1,
            65536,
            2 << 20,
            &[(1, 0x10000), (2, 0x20000)],
        );
        let cache = cache_with(m);
        let placements = cache.plan_write(65536, 4096).unwrap().unwrap_plain();
        assert_eq!(placements.len(), 1);
        assert_eq!(placements[0].devid, 2);
        assert_eq!(placements[0].physical, 0x20000);
    }

    #[test]
    fn plan_write_raid0_third_row_wraps_to_first_device() {
        // Row 2 (logical = 2 * STRIPE_LEN) wraps back to stripe[0],
        // but the per-device stripe number is 1 (advances on the device).
        let m = make_chunk(
            raw::BTRFS_BLOCK_GROUP_RAID0,
            2,
            1,
            65536,
            2 << 20,
            &[(1, 0x10000), (2, 0x20000)],
        );
        let cache = cache_with(m);
        let placements =
            cache.plan_write(2 * 65536, 4096).unwrap().unwrap_plain();
        assert_eq!(placements.len(), 1);
        assert_eq!(placements[0].devid, 1);
        assert_eq!(placements[0].physical, 0x10000 + 65536);
    }

    #[test]
    fn plan_write_raid0_spans_multiple_rows_round_robins_devices() {
        // 3 devices, stripe_len=64K. Write 192K starting at logical 0:
        // row 0 -> dev 1, row 1 -> dev 2, row 2 -> dev 3.
        let m = make_chunk(
            raw::BTRFS_BLOCK_GROUP_RAID0,
            3,
            1,
            65536,
            6 << 20,
            &[(1, 0x10000), (2, 0x20000), (3, 0x30000)],
        );
        let cache = cache_with(m);
        let placements =
            cache.plan_write(0, 192 * 1024).unwrap().unwrap_plain();
        assert_eq!(placements.len(), 3);
        for (i, p) in placements.iter().enumerate() {
            assert_eq!(p.devid, (i + 1) as u64);
            assert_eq!(p.buf_offset, i * 65536);
            assert_eq!(p.len, 65536);
        }
        // All three rows are stripe_nr 0, 1, 2; per_device_stripe_nr = 0.
        for p in &placements {
            // Each device's base physical is 0x{i+1}_0000; per-device
            // offset within is 0.
            assert_eq!(p.physical & 0xFFFF, 0);
        }
    }

    #[test]
    fn plan_write_raid0_partial_first_row_then_full_then_partial() {
        // 2 devices, stripe_len 64K. Start mid-row: logical = 32K, len = 96K.
        // Row 0: dev 1, 32K..64K (32K bytes), buf 0..32K.
        // Row 1: dev 2, 0..64K (64K bytes), buf 32K..96K.
        let m = make_chunk(
            raw::BTRFS_BLOCK_GROUP_RAID0,
            2,
            1,
            65536,
            2 << 20,
            &[(1, 0x10000), (2, 0x20000)],
        );
        let cache = cache_with(m);
        let placements = cache
            .plan_write(32 * 1024, 96 * 1024)
            .unwrap()
            .unwrap_plain();
        assert_eq!(placements.len(), 2);
        assert_eq!(placements[0].devid, 1);
        assert_eq!(placements[0].physical, 0x10000 + 32 * 1024);
        assert_eq!(placements[0].buf_offset, 0);
        assert_eq!(placements[0].len, 32 * 1024);
        assert_eq!(placements[1].devid, 2);
        assert_eq!(placements[1].physical, 0x20000);
        assert_eq!(placements[1].buf_offset, 32 * 1024);
        assert_eq!(placements[1].len, 64 * 1024);
    }
1708
1709    // --- RAID10 striped mirrors ---
1710
1711    #[test]
1712    fn plan_write_raid10_first_row_writes_pair_zero() {
1713        // 4 stripes, sub_stripes=2: 2 mirror pairs.
1714        // Row 0 -> pair 0 = stripes[0,1] = devs (1, 2).
1715        let m = make_chunk(
1716            raw::BTRFS_BLOCK_GROUP_RAID10,
1717            4,
1718            2,
1719            65536,
1720            4 << 20,
1721            &[(1, 0x10000), (2, 0x20000), (3, 0x30000), (4, 0x40000)],
1722        );
1723        let cache = cache_with(m);
1724        let placements = cache.plan_write(0, 4096).unwrap().unwrap_plain();
1725        assert_eq!(placements.len(), 2);
1726        assert_eq!(placements[0].devid, 1);
1727        assert_eq!(placements[0].physical, 0x10000);
1728        assert_eq!(placements[1].devid, 2);
1729        assert_eq!(placements[1].physical, 0x20000);
1730        for p in &placements {
1731            assert_eq!(p.buf_offset, 0);
1732            assert_eq!(p.len, 4096);
1733        }
1734    }
1735
1736    #[test]
1737    fn plan_write_raid10_second_row_writes_pair_one() {
1738        let m = make_chunk(
1739            raw::BTRFS_BLOCK_GROUP_RAID10,
1740            4,
1741            2,
1742            65536,
1743            4 << 20,
1744            &[(1, 0x10000), (2, 0x20000), (3, 0x30000), (4, 0x40000)],
1745        );
1746        let cache = cache_with(m);
1747        // Row 1 (logical = STRIPE_LEN) -> pair 1 = stripes[2,3] = devs (3, 4).
1748        let placements = cache.plan_write(65536, 4096).unwrap().unwrap_plain();
1749        assert_eq!(placements.len(), 2);
1750        assert_eq!(placements[0].devid, 3);
1751        assert_eq!(placements[1].devid, 4);
1752    }
1753
1754    #[test]
1755    fn plan_write_raid10_wraps_after_factor_rows() {
1756        // 4-stripe RAID10: factor = 4/2 = 2. Row 2 wraps back to pair 0
1757        // but advances per-device offset.
1758        let m = make_chunk(
1759            raw::BTRFS_BLOCK_GROUP_RAID10,
1760            4,
1761            2,
1762            65536,
1763            4 << 20,
1764            &[(1, 0x10000), (2, 0x20000), (3, 0x30000), (4, 0x40000)],
1765        );
1766        let cache = cache_with(m);
1767        let placements =
1768            cache.plan_write(2 * 65536, 4096).unwrap().unwrap_plain();
1769        assert_eq!(placements.len(), 2);
1770        assert_eq!(placements[0].devid, 1);
1771        assert_eq!(placements[0].physical, 0x10000 + 65536);
1772        assert_eq!(placements[1].devid, 2);
1773        assert_eq!(placements[1].physical, 0x20000 + 65536);
1774    }
1775
1776    // --- plan_read ---
1777
1778    #[test]
1779    fn plan_read_picks_first_mirror_per_row() {
1780        // RAID1 read: only one placement (the first mirror).
1781        let m = make_chunk(
1782            raw::BTRFS_BLOCK_GROUP_RAID1,
1783            2,
1784            1,
1785            65536,
1786            1 << 20,
1787            &[(1, 0x1000), (2, 0x2000)],
1788        );
1789        let cache = cache_with(m);
1790        let placements = cache.plan_read(0, 8192).unwrap();
1791        assert_eq!(placements.len(), 1);
1792        assert_eq!(placements[0].devid, 1);
1793    }
1794
1795    #[test]
1796    fn plan_read_raid10_picks_first_mirror_per_row() {
1797        // RAID10 spanning two rows: 2 placements, each on one device only.
1798        let m = make_chunk(
1799            raw::BTRFS_BLOCK_GROUP_RAID10,
1800            4,
1801            2,
1802            65536,
1803            4 << 20,
1804            &[(1, 0x10000), (2, 0x20000), (3, 0x30000), (4, 0x40000)],
1805        );
1806        let cache = cache_with(m);
1807        let placements = cache.plan_read(0, 128 * 1024).unwrap();
1808        assert_eq!(placements.len(), 2);
1809        // Row 0 mirror pair (1,2) -> first = dev 1.
1810        assert_eq!(placements[0].devid, 1);
1811        // Row 1 mirror pair (3,4) -> first = dev 3.
1812        assert_eq!(placements[1].devid, 3);
1813    }
1814
1815    #[test]
1816    fn plan_read_raid0_matches_plan_write() {
1817        // Read planning for a striped profile is identical to write
1818        // planning (RAID0 has only one column per row anyway).
1819        let m = make_chunk(
1820            raw::BTRFS_BLOCK_GROUP_RAID0,
1821            2,
1822            1,
1823            65536,
1824            2 << 20,
1825            &[(1, 0x10000), (2, 0x20000)],
1826        );
1827        let cache = cache_with(m);
1828        let r = cache.plan_read(32 * 1024, 96 * 1024).unwrap();
1829        let w = cache
1830            .plan_write(32 * 1024, 96 * 1024)
1831            .unwrap()
1832            .unwrap_plain();
1833        assert_eq!(r, w);
1834    }
1835
1836    // --- Bounds and error cases ---
1837
1838    #[test]
1839    fn plan_write_out_of_chunk_returns_none() {
1840        let m = make_chunk(0, 1, 1, 65536, 1 << 20, &[(1, 0)]);
1841        let cache = cache_with(m);
1842        // Request straddles the chunk end.
1843        assert!(cache.plan_write((1 << 20) - 4096, 8192).is_none());
1844        // Request entirely past the chunk.
1845        assert!(cache.plan_write(2 << 20, 4096).is_none());
1846        // Empty request still succeeds (zero placements).
1847        let p = cache.plan_write(0, 0).unwrap().unwrap_plain();
1848        assert!(p.is_empty());
1849    }
1850
1851    #[test]
1852    fn plan_write_unmapped_returns_none() {
1853        let cache = ChunkTreeCache::default();
1854        assert!(cache.plan_write(0, 4096).is_none());
1855    }
1856
1857    #[test]
1858    fn plan_write_raid5_returns_parity_plan() {
1859        let m = make_chunk(
1860            raw::BTRFS_BLOCK_GROUP_RAID5,
1861            3,
1862            1,
1863            65536,
1864            3 << 20,
1865            &[(1, 0), (2, 0), (3, 0)],
1866        );
1867        let cache = cache_with(m);
1868        let plan = cache.plan_write(0, 4096).unwrap();
1869        match plan {
1870            WritePlan::Parity(_) => {}
1871            WritePlan::Plain(_) => panic!("expected Parity plan for RAID5"),
1872        }
1873    }
1874
1875    // --- RAID5 / RAID6 plan_write parity routing ---
1876
1877    /// Helper for parity-plan tests: build a RAID5/RAID6 chunk with
1878    /// per-device offsets that match the column index, so a column's
1879    /// physical address tells you which devid (and therefore which
1880    /// column) the executor will write to.
1881    fn make_parity_chunk(
1882        chunk_type_bit: u32,
1883        num_stripes: u16,
1884        stripe_len: u64,
1885        length: u64,
1886    ) -> ChunkMapping {
1887        // Devid `i+1` lives at physical `0x10_0000 + i * 0x10_0000`.
1888        let stripes: Vec<(u64, u64)> = (0..num_stripes)
1889            .map(|i| (u64::from(i) + 1, 0x10_0000 + u64::from(i) * 0x10_0000))
1890            .collect();
1891        make_chunk(chunk_type_bit, num_stripes, 1, stripe_len, length, &stripes)
1892    }
1893
1894    #[test]
1895    fn plan_write_raid5_three_devices_single_row() {
1896        // 3-device RAID5, stripe_len=64K. Tree-block-style write of 16K
1897        // at logical 0: covers data column 0 of physical row 0. Column
1898        // layout for row 0: P at col 2 (= 3-1-0), data cols are 0, 1.
1899        let m =
1900            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID5, 3, 65536, 3 << 20);
1901        let cache = cache_with(m);
1902        let WritePlan::Parity(plan) = cache.plan_write(0, 16 * 1024).unwrap()
1903        else {
1904            panic!("expected Parity");
1905        };
1906        assert_eq!(plan.stripe_len, 65536);
1907        assert_eq!(plan.rows.len(), 1);
1908        let row = &plan.rows[0];
1909        // Two data columns, one parity target.
1910        assert_eq!(row.data_columns.len(), 2);
1911        assert_eq!(row.parity_targets.len(), 1);
1912        assert_eq!(row.parity_targets[0].kind, ParityKind::P);
1913        // Parity column is col 2 -> devid 3.
1914        assert_eq!(row.parity_targets[0].devid, 3);
1915        // Data column 0 of the row is physical col 0 -> devid 1, with
1916        // overlay covering [0, 16384).
1917        assert_eq!(row.data_columns[0].devid, 1);
1918        let ov = row.data_columns[0].overlay.as_ref().unwrap();
1919        assert_eq!(ov.slot_offset, 0);
1920        assert_eq!(ov.buf_offset, 0);
1921        assert_eq!(ov.len, 16 * 1024);
1922        // Data column 1 is physical col 1 -> devid 2, untouched.
1923        assert_eq!(row.data_columns[1].devid, 2);
1924        assert!(row.data_columns[1].overlay.is_none());
1925    }
1926
1927    #[test]
1928    fn plan_write_raid5_data_column_rotation() {
1929        // Walk physical rows of a 4-device RAID5 chunk and verify the
1930        // parity column rotates: row r -> P at col (4-1-r) mod 4.
1931        let m =
1932            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID5, 4, 65536, 8 << 20);
1933        let cache = cache_with(m);
1934        // data_per_row = 3, logical_per_phys_row = 3 * 64K = 192K.
1935        let row_bytes = 192 * 1024;
1936        for phys_row in 0u64..4 {
1937            let logical = phys_row * row_bytes;
1938            let WritePlan::Parity(plan) =
1939                cache.plan_write(logical, 16 * 1024).unwrap()
1940            else {
1941                panic!("expected Parity");
1942            };
1943            let row = &plan.rows[0];
1944            let expected_p_col = ((4 - 1 - phys_row) % 4) as u16;
1945            // Map col index to devid (devid = col + 1).
1946            assert_eq!(
1947                row.parity_targets[0].devid,
1948                u64::from(expected_p_col) + 1,
1949                "parity row {phys_row}",
1950            );
1951        }
1952    }
1953
1954    #[test]
1955    fn plan_write_raid6_four_devices_single_row() {
1956        // 4-device RAID6, stripe_len=64K. Two data, two parity. Row 0:
1957        // P at col 2 (4-2-0), Q at col 3 (4-1-0). Data cols: 0, 1.
1958        // Single 16K write at logical 0.
1959        let m =
1960            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID6, 4, 65536, 4 << 20);
1961        let cache = cache_with(m);
1962        let WritePlan::Parity(plan) = cache.plan_write(0, 16 * 1024).unwrap()
1963        else {
1964            panic!("expected Parity");
1965        };
1966        let row = &plan.rows[0];
1967        assert_eq!(row.data_columns.len(), 2);
1968        assert_eq!(row.parity_targets.len(), 2);
1969        assert_eq!(row.parity_targets[0].kind, ParityKind::P);
1970        assert_eq!(row.parity_targets[1].kind, ParityKind::Q);
1971        // P col = 2 -> devid 3, Q col = 3 -> devid 4.
1972        assert_eq!(row.parity_targets[0].devid, 3);
1973        assert_eq!(row.parity_targets[1].devid, 4);
1974        assert_eq!(row.data_columns[0].devid, 1);
1975        assert_eq!(row.data_columns[1].devid, 2);
1976    }
1977
1978    #[test]
1979    fn plan_write_raid56_partial_row_overlay_offsets() {
1980        // Write that covers only the middle 4K of one column slot. The
1981        // overlay must report slot_offset = the offset within the slot,
1982        // not the chunk-wide offset. Other column has no overlay.
1983        let m =
1984            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID5, 3, 65536, 3 << 20);
1985        let cache = cache_with(m);
1986        // Logical 0x4000 -> stripe_nr 0, stripe_offset 0x4000. Row 0,
1987        // data col 0 (physical col 0, devid 1).
1988        let WritePlan::Parity(plan) = cache.plan_write(0x4000, 0x1000).unwrap()
1989        else {
1990            panic!("expected Parity");
1991        };
1992        let row = &plan.rows[0];
1993        let ov = row.data_columns[0].overlay.as_ref().unwrap();
1994        assert_eq!(ov.slot_offset, 0x4000);
1995        assert_eq!(ov.len, 0x1000);
1996        assert_eq!(ov.buf_offset, 0);
1997        assert!(row.data_columns[1].overlay.is_none());
1998    }
1999
2000    #[test]
2001    fn plan_write_raid5_spanning_two_data_columns_in_one_row() {
2002        // 3-device RAID5: row 0 has 2 data slots (each 64K of logical).
2003        // Logical 0..128K covers both data columns of row 0 fully.
2004        // Both data columns should have overlay covering full slot;
2005        // one row total.
2006        let m =
2007            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID5, 3, 65536, 3 << 20);
2008        let cache = cache_with(m);
2009        let WritePlan::Parity(plan) = cache.plan_write(0, 128 * 1024).unwrap()
2010        else {
2011            panic!("expected Parity");
2012        };
2013        assert_eq!(plan.rows.len(), 1);
2014        let row = &plan.rows[0];
2015        // Both data columns covered by caller bytes.
2016        for (i, dc) in row.data_columns.iter().enumerate() {
2017            let ov = dc.overlay.as_ref().expect("data col overlay");
2018            assert_eq!(ov.slot_offset, 0);
2019            assert_eq!(ov.len, 65536);
2020            assert_eq!(ov.buf_offset, i * 65536);
2021        }
2022    }
2023
2024    #[test]
2025    fn plan_write_raid5_spans_two_physical_rows() {
2026        // 3-device RAID5: logical_per_phys_row = 128K. Write 192K
2027        // starting at 64K: covers (logical 64K..128K, dat col 1 of row 0)
2028        // + (logical 128K..256K, both data cols of row 1).
2029        let m =
2030            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID5, 3, 65536, 3 << 20);
2031        let cache = cache_with(m);
2032        let WritePlan::Parity(plan) =
2033            cache.plan_write(64 * 1024, 192 * 1024).unwrap()
2034        else {
2035            panic!("expected Parity");
2036        };
2037        assert_eq!(plan.rows.len(), 2);
2038        // Row 0: P col = 2, data cols = (0, 1). Caller covers data col 1.
2039        let r0 = &plan.rows[0];
2040        assert!(r0.data_columns[0].overlay.is_none());
2041        let ov0 = r0.data_columns[1].overlay.as_ref().unwrap();
2042        assert_eq!(ov0.slot_offset, 0);
2043        assert_eq!(ov0.len, 65536);
2044        assert_eq!(ov0.buf_offset, 0);
2045        // Row 1: P col = 1 ((3-1-1) mod 3), data cols = phys 0 and 2.
2046        let r1 = &plan.rows[1];
2047        assert_eq!(r1.parity_targets[0].devid, 2);
2048        for (i, dc) in r1.data_columns.iter().enumerate() {
2049            let ov = dc.overlay.as_ref().unwrap();
2050            assert_eq!(ov.slot_offset, 0);
2051            assert_eq!(ov.len, 65536);
2052            // Buf offsets: row 0 consumed 64K, row 1 starts at 64K
2053            // and each col is 64K.
2054            assert_eq!(ov.buf_offset, 64 * 1024 + i * 65536);
2055        }
2056    }
2057
2058    // --- RAID5 / RAID6 plan_read ---
2059
2060    #[test]
2061    fn plan_read_raid5_routes_to_data_column() {
2062        // 3-device RAID5: read at logical 0 of 4K. Should land on data
2063        // column 0 of row 0 = physical col 0 = devid 1, ignoring parity.
2064        let m =
2065            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID5, 3, 65536, 3 << 20);
2066        let cache = cache_with(m);
2067        let placements = cache.plan_read(0, 4096).unwrap();
2068        assert_eq!(placements.len(), 1);
2069        assert_eq!(placements[0].devid, 1);
2070        assert_eq!(placements[0].physical, 0x10_0000);
2071    }
2072
2073    #[test]
2074    fn plan_read_raid5_second_data_column_routes_to_devid_2() {
2075        // Logical = stripe_len = 64K -> stripe_nr 1, data col 1 of row 0
2076        // = physical col 1 = devid 2.
2077        let m =
2078            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID5, 3, 65536, 3 << 20);
2079        let cache = cache_with(m);
2080        let placements = cache.plan_read(65536, 4096).unwrap();
2081        assert_eq!(placements.len(), 1);
2082        assert_eq!(placements[0].devid, 2);
2083    }
2084
2085    #[test]
2086    fn plan_read_raid5_advances_to_next_physical_row() {
2087        // Logical = 128K = 2 * stripe_len -> stripe_nr 2, phys_row 1
2088        // (since data_per_row = 2). Row 1 P col = 1 (= (3-1-1) mod 3),
2089        // so data cols are phys 0 and 2. data_col_in_row = 0 -> phys 0
2090        // = devid 1.
2091        let m =
2092            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID5, 3, 65536, 3 << 20);
2093        let cache = cache_with(m);
2094        let placements = cache.plan_read(128 * 1024, 4096).unwrap();
2095        assert_eq!(placements.len(), 1);
2096        assert_eq!(placements[0].devid, 1);
2097        // Per-device stripe_nr = 1 (advanced on the device).
2098        assert_eq!(placements[0].physical, 0x10_0000 + 65536);
2099    }
2100
2101    #[test]
2102    fn plan_read_raid6_routes_skipping_two_parity_columns() {
2103        // 4-device RAID6, row 0: P col 2, Q col 3, data cols 0 and 1.
2104        // Logical 0 -> data col 0 -> phys 0 -> devid 1.
2105        let m =
2106            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID6, 4, 65536, 4 << 20);
2107        let cache = cache_with(m);
2108        let placements = cache.plan_read(0, 4096).unwrap();
2109        assert_eq!(placements.len(), 1);
2110        assert_eq!(placements[0].devid, 1);
2111    }
2112
2113    #[test]
2114    fn plan_write_raid5_buf_offsets_cover_request_exactly() {
2115        // Tile property: across all rows, the caller-overlay byte
2116        // ranges sum to the request length and tile [0, len) in the
2117        // caller's buffer without gaps or overlaps.
2118        let m =
2119            make_parity_chunk(raw::BTRFS_BLOCK_GROUP_RAID5, 3, 65536, 3 << 20);
2120        let cache = cache_with(m);
2121        let req_len = 200 * 1024;
2122        let WritePlan::Parity(plan) =
2123            cache.plan_write(8 * 1024, req_len).unwrap()
2124        else {
2125            panic!();
2126        };
2127        let mut overlays: Vec<&CallerOverlay> = plan
2128            .rows
2129            .iter()
2130            .flat_map(|r| r.data_columns.iter())
2131            .filter_map(|dc| dc.overlay.as_ref())
2132            .collect();
2133        overlays.sort_by_key(|o| o.buf_offset);
2134        let mut next = 0usize;
2135        for o in &overlays {
2136            assert_eq!(o.buf_offset, next);
2137            next += o.len as usize;
2138        }
2139        assert_eq!(next, req_len);
2140    }
2141
2142    #[test]
2143    fn plan_write_buf_offsets_cover_request_exactly() {
2144        // For a striped (no-mirror) profile, the sum of placement
2145        // lengths should equal the request length and the buf_offsets
2146        // should tile [0, len) without gaps.
2147        let m = make_chunk(
2148            raw::BTRFS_BLOCK_GROUP_RAID0,
2149            2,
2150            1,
2151            65536,
2152            4 << 20,
2153            &[(1, 0x10000), (2, 0x20000)],
2154        );
2155        let cache = cache_with(m);
2156        let placements =
2157            cache.plan_write(0, 256 * 1024).unwrap().unwrap_plain();
2158        let total: usize = placements.iter().map(|p| p.len).sum();
2159        assert_eq!(total, 256 * 1024);
2160        // Check buf_offsets tile contiguously.
2161        let mut next = 0;
2162        for p in &placements {
2163            assert_eq!(p.buf_offset, next);
2164            next += p.len;
2165        }
2166        assert_eq!(next, 256 * 1024);
2167    }
2168}