lsm_tree/
verify.rs

1// SPDX-License-Identifier: Apache-2.0
2// Copyright (c) 2024-present, fjall-rs
3// Copyright (c) 2026-present, Structured World Foundation
4
5use crate::path::{Path, PathBuf};
6use crate::{checksum::Checksum, coding::Decode, io, table::TableId, table::block::Header};
7#[cfg(not(feature = "std"))]
8use alloc::{boxed::Box, string::String, vec::Vec};
9
/// Describes a single integrity error found during verification.
///
/// [`verify_integrity`] collects these into [`IntegrityReport::errors`]. Each
/// value concerns exactly one file: either its full-file checksum did not
/// match the expected one (`SstFileCorrupted` / `BlobFileCorrupted`), or the
/// file could not be read at all (`IoError`, used for SST and blob files
/// alike).
///
/// Full-file integrity (hashing whole files by path) uses `std::fs` directly and
/// is gated to `std`; the `no_std` verify path is block-level over the injected
/// [`Fs`](crate::fs::Fs) backend (see [`verify_block_checksums`]).
#[cfg(feature = "std")]
#[derive(Debug)]
#[non_exhaustive]
pub enum IntegrityError {
    /// Full-file checksum mismatch for an SST table.
    SstFileCorrupted {
        /// ID of the table whose file failed verification
        table_id: TableId,
        /// Path to the corrupted file
        path: PathBuf,
        /// Checksum stored in the manifest
        expected: Checksum,
        /// Checksum computed from the bytes currently on disk
        got: Checksum,
    },

    /// Full-file checksum mismatch for a blob file.
    BlobFileCorrupted {
        /// ID of the blob file that failed verification
        blob_file_id: u64,
        /// Path to the corrupted file
        path: PathBuf,
        /// Checksum stored in the manifest
        expected: Checksum,
        /// Checksum computed from the bytes currently on disk
        got: Checksum,
    },

    /// I/O error while reading a file during verification.
    ///
    /// No checksum comparison took place for this file.
    IoError {
        /// Path to the file that could not be read
        path: PathBuf,
        /// The underlying I/O error (also returned by `Error::source`)
        error: io::Error,
    },
}
51
#[cfg(feature = "std")]
impl core::fmt::Display for IntegrityError {
    /// Writes a one-line, human-readable description of the finding: the
    /// affected file and, for checksum mismatches, the expected and computed
    /// digests.
    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
        match self {
            // Unreadable file: report the path together with the I/O cause.
            Self::IoError { path, error } => f.write_fmt(format_args!(
                "I/O error reading {}: {}",
                path.display(),
                error
            )),
            Self::BlobFileCorrupted {
                blob_file_id,
                path,
                expected,
                got,
            } => f.write_fmt(format_args!(
                "blob file {blob_file_id} corrupted at {}: expected {expected}, got {got}",
                path.display()
            )),
            Self::SstFileCorrupted {
                table_id,
                path,
                expected,
                got,
            } => f.write_fmt(format_args!(
                "SST table {table_id} corrupted at {}: expected {expected}, got {got}",
                path.display()
            )),
        }
    }
}
82
#[cfg(feature = "std")]
impl core::error::Error for IntegrityError {
    /// Returns the wrapped I/O error for `IoError`; checksum mismatches have
    /// no underlying cause and yield `None`.
    fn source(&self) -> Option<&(dyn core::error::Error + 'static)> {
        // Listing every variant makes the compiler flag this method whenever
        // a new variant is added.
        match self {
            Self::SstFileCorrupted { .. } | Self::BlobFileCorrupted { .. } => None,
            Self::IoError { error, .. } => Some(error),
        }
    }
}
92
/// Result of an integrity verification scan, as returned by
/// [`verify_integrity`].
///
/// The `sst_files_checked` and `blob_files_checked` counters reflect
/// the number of files *attempted* — including those that produced I/O
/// errors. This lets callers reconcile the total against the manifest
/// even when some files were unreadable.
#[cfg(feature = "std")]
#[derive(Debug)]
#[non_exhaustive]
pub struct IntegrityReport {
    /// Number of SST table files checked (includes I/O errors and
    /// checksum mismatches).
    pub sst_files_checked: usize,

    /// Number of blob files checked (includes I/O errors and checksum
    /// mismatches).
    pub blob_files_checked: usize,

    /// Integrity errors found during verification; empty when every file
    /// was readable and matched its expected checksum.
    pub errors: Vec<IntegrityError>,
}
112
#[cfg(feature = "std")]
impl IntegrityReport {
    /// Reports whether the scan finished without recording a single error.
    #[must_use]
    pub fn is_ok(&self) -> bool {
        let Self { errors, .. } = self;
        errors.is_empty()
    }

    /// Combined count of SST and blob files the scan attempted.
    #[must_use]
    pub fn files_checked(&self) -> usize {
        let Self {
            sst_files_checked,
            blob_files_checked,
            ..
        } = self;
        sst_files_checked + blob_files_checked
    }
}
127
/// Computes a streaming XXH3 128-bit checksum for a file without loading it entirely into memory.
///
/// The file is consumed through a 64 KiB buffered reader; every chunk the
/// reader hands out is fed to the hasher exactly as returned, so no bytes can
/// be skipped between the read and the digest.
///
/// `pub(crate)` so [`crate::salvage`] can stamp the salvaged-source open with
/// the file's current digest (the source may be corrupt, so its digest is
/// whatever bytes are on disk; per-block checksums catch the actual damage).
///
/// # Errors
///
/// Returns the underlying I/O error if the file cannot be opened or a read
/// fails. Reads failing with [`std::io::ErrorKind::Interrupted`] are retried.
#[cfg(feature = "std")]
pub(crate) fn stream_checksum(path: &std::path::Path) -> std::io::Result<Checksum> {
    use std::io::BufRead;

    const CHUNK_SIZE: usize = 64 * 1024;

    let file = std::fs::File::open(path)?;
    let mut reader = std::io::BufReader::with_capacity(CHUNK_SIZE, file);
    let mut hasher = xxhash_rust::xxh3::Xxh3Default::new();

    loop {
        // `fill_buf` yields the readable bytes as a slice, so there is no
        // length to re-validate and no indexing (keeps the crate-wide
        // `clippy::indexing_slicing` lint satisfied).
        let chunk = match reader.fill_buf() {
            Ok(chunk) => chunk,
            Err(e) if e.kind() == std::io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        // An empty slice from `fill_buf` signals end of file.
        if chunk.is_empty() {
            break;
        }
        hasher.update(chunk);
        let consumed = chunk.len();
        reader.consume(consumed);
    }

    Ok(Checksum::from_raw(hasher.digest128()))
}
160
/// Checks the full-file checksum of every SST and blob file referenced by the
/// tree's current version.
///
/// Each file is streamed from disk and hashed with XXHash-3 128-bit; the
/// digest is then compared against the checksum stored in the version
/// manifest. This detects silent bit-rot, partial writes, and other on-disk
/// corruption.
///
/// The scan never aborts early: an unreadable file or a checksum mismatch is
/// recorded in [`IntegrityReport::errors`] and the scan moves on. Every file
/// attempted is counted, whatever its outcome.
#[cfg(feature = "std")]
#[must_use]
pub fn verify_integrity(tree: &impl crate::AbstractTree) -> IntegrityReport {
    let version = tree.current_version();
    let mut errors = Vec::new();

    // SST table files.
    let mut sst_files_checked = 0;
    for table in version.iter_tables() {
        let path = &*table.path;
        let expected = table.checksum();

        let finding = match stream_checksum(path) {
            Ok(got) if got == expected => None,
            Ok(got) => Some(IntegrityError::SstFileCorrupted {
                table_id: table.id(),
                path: path.clone(),
                expected,
                got,
            }),
            Err(e) => Some(IntegrityError::IoError {
                path: path.clone(),
                error: e.into(),
            }),
        };
        errors.extend(finding);

        sst_files_checked += 1;
    }

    // Blob files.
    let mut blob_files_checked = 0;
    for blob_file in version.blob_files.iter() {
        let path = blob_file.path();
        let expected = blob_file.checksum();

        let finding = match stream_checksum(path) {
            Ok(got) if got == expected => None,
            Ok(got) => Some(IntegrityError::BlobFileCorrupted {
                blob_file_id: blob_file.id(),
                path: path.to_path_buf(),
                expected,
                got,
            }),
            Err(e) => Some(IntegrityError::IoError {
                path: path.to_path_buf(),
                error: e.into(),
            }),
        };
        errors.extend(finding);

        blob_files_checked += 1;
    }

    IntegrityReport {
        sst_files_checked,
        blob_files_checked,
        errors,
    }
}
235
236// ── Block-level scrub ─────────────────────────────────────────────────────
237// `verify_integrity` above hashes each SST as one opaque byte stream and
238// compares the digest to the per-file checksum stored in the manifest. That
239// catches whole-file corruption but identifies the bad region only at file
240// granularity. The functions below walk every block inside every SST and
241// verify per-block XXH3 against the value embedded in each block's own
242// header, so a corrupt block can be reported with its exact `(file, offset)`
243// without re-running the manifest-level scan.
244
/// Per-block verification error.
///
/// Collected in [`BlockVerifyReport::errors`]. `SstFileUnreadable` and
/// `DataReadError` wrap an `io::Error` that is also returned by
/// `Error::source`; the remaining variants have no underlying source.
#[derive(Debug)]
#[non_exhaustive]
pub enum BlockVerifyError {
    /// SST file could not be opened or its trailer parsed.
    SstFileUnreadable {
        /// Table ID.
        table_id: TableId,
        /// Path to the SST file.
        path: PathBuf,
        /// Underlying I/O / format error.
        error: io::Error,
    },

    /// A block header at the given offset failed to parse — either
    /// XXH3 mismatch on the header itself, or invalid magic bytes /
    /// length fields that point at on-disk corruption.
    HeaderCorrupted {
        /// Table ID.
        table_id: TableId,
        /// Path to the SST file.
        path: PathBuf,
        /// File offset where the corrupt header was read from.
        offset: u64,
        /// Short description of the failure surfaced by header decoding.
        reason: String,
    },

    /// A block's data XXH3 did not match the value stored in its header.
    /// Indicates bit-rot or torn write on the block payload.
    DataCorrupted {
        /// Table ID.
        table_id: TableId,
        /// Path to the SST file.
        path: PathBuf,
        /// File offset where the block header sits (the data follows it).
        offset: u64,
        /// Length of the on-disk data segment, in bytes.
        data_length: u32,
        /// Checksum stored in the block header.
        expected: Checksum,
        /// Checksum computed from the on-disk bytes.
        got: Checksum,
    },

    /// The block header was successfully decoded (its own XXH3
    /// matched) but the subsequent fixed-length read of the data
    /// segment failed at the filesystem layer — truncated file,
    /// unexpected EOF, transient I/O error. Distinct from
    /// `HeaderCorrupted` because the header itself was clean: the
    /// failure is on the bytes that should follow it.
    DataReadError {
        /// Table ID.
        table_id: TableId,
        /// Path to the SST file.
        path: PathBuf,
        /// File offset where the (clean) header sits; the read for
        /// its data segment started at `offset + Header::header_len(block_type)`.
        offset: u64,
        /// Length the (clean) header advertised for the data segment,
        /// in bytes.
        data_length: u32,
        /// Underlying I/O error from the failed data-segment read.
        /// Kept as a structured `io::Error` (matching
        /// `SstFileUnreadable`) rather than a rendered string, so the
        /// error's details stay available to callers and so
        /// `Error::source()` produces a coherent chain.
        error: io::Error,
    },

    /// SFA TOC-level corruption: a named section's length / position
    /// fields are inconsistent (overflow on addition), or seeking to
    /// its declared start offset fails before any block is read.
    /// Distinct from `HeaderCorrupted` (which is per-block) so
    /// callers can tell "the section catalogue itself is bad" apart
    /// from "block N inside an otherwise-walkable section is bad" —
    /// e.g. a `TocCorrupted` makes the whole section unreachable,
    /// while a `HeaderCorrupted` only stops that section's walk.
    TocCorrupted {
        /// Table ID.
        table_id: TableId,
        /// Path to the SST file.
        path: PathBuf,
        /// Section name from the TOC entry (e.g. `b"data"`,
        /// `b"tli"`). Stored verbatim, not lossy-decoded, because
        /// SFA section names are byte strings. (The `Display`
        /// rendering is lossy; this field keeps the exact bytes.)
        section_name: Vec<u8>,
        /// File offset where the section *would* start per the TOC
        /// entry. Useful for forensics even when the start is
        /// unreachable.
        section_offset: u64,
        /// Short description of the failure (overflow on
        /// start+length, seek error, etc.).
        reason: String,
    },
}
339
340impl core::fmt::Display for BlockVerifyError {
341    fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
342        match self {
343            Self::SstFileUnreadable {
344                table_id,
345                path,
346                error,
347            } => write!(
348                f,
349                "SST table {table_id} at {} could not be opened/parsed: {error}",
350                path.display(),
351            ),
352            Self::HeaderCorrupted {
353                table_id,
354                path,
355                offset,
356                reason,
357            } => write!(
358                f,
359                "SST table {table_id} at {}: block header at offset {offset} is corrupt ({reason})",
360                path.display(),
361            ),
362            Self::DataCorrupted {
363                table_id,
364                path,
365                offset,
366                data_length,
367                expected,
368                got,
369            } => write!(
370                f,
371                "SST table {table_id} at {}: block at offset {offset} ({data_length} bytes) data \
372                 checksum mismatch, expected {expected}, got {got}",
373                path.display(),
374            ),
375            Self::DataReadError {
376                table_id,
377                path,
378                offset,
379                data_length,
380                error,
381            } => write!(
382                f,
383                "SST table {table_id} at {}: failed to read {data_length}-byte data segment for \
384                 block at offset {offset}: {error}",
385                path.display(),
386            ),
387            Self::TocCorrupted {
388                table_id,
389                path,
390                section_name,
391                section_offset,
392                reason,
393            } => write!(
394                f,
395                "SST table {table_id} at {}: TOC section {:?} at offset {section_offset} is \
396                 unreachable ({reason})",
397                path.display(),
398                String::from_utf8_lossy(section_name),
399            ),
400        }
401    }
402}
403
404impl core::error::Error for BlockVerifyError {
405    fn source(&self) -> Option<&(dyn core::error::Error + 'static)> {
406        match self {
407            Self::SstFileUnreadable { error, .. } | Self::DataReadError { error, .. } => {
408                Some(error)
409            }
410            _ => None,
411        }
412    }
413}
414
/// A non-fatal finding from a scrub run: the data is intact, but something
/// about a table could not be fully checked.
///
/// Collected in [`BlockVerifyReport::warnings`]; a warning never makes
/// [`BlockVerifyReport::is_ok`] return `false`.
#[derive(Debug)]
#[non_exhaustive]
pub enum BlockVerifyWarning {
    /// The table's `descriptor#page_ecc` decodes to an ECC scheme this build
    /// cannot apply (an unimplemented scheme, page granularity, an unknown
    /// kind, or a non-canonical descriptor). Block payloads still verify by
    /// their own checksums, but the parity trailer length is not derivable
    /// from a scheme, so the sequential block walk cannot size it and ECC
    /// verification was skipped for this table. Recompaction re-stamps the
    /// table with a supported scheme.
    UnrecognizedEcc {
        /// Table the warning applies to.
        table_id: TableId,
        /// On-disk path of the SST.
        path: PathBuf,
    },
}
434
/// Aggregated result of a per-block scrub run.
///
/// Per-SST partial reports are summed into one of these, so the counters
/// are totals across every table the run visited.
#[derive(Debug, Default)]
#[non_exhaustive]
pub struct BlockVerifyReport {
    /// Number of SST table files visited. Each per-table scan contributes
    /// exactly one, including tables that turned out to be unreadable.
    pub sst_files_scanned: usize,
    /// Total blocks successfully header-read across all SSTs. Includes
    /// blocks where the data checksum subsequently failed.
    pub blocks_scanned: usize,
    /// Per-block errors collected during the scan. The scan always
    /// runs to completion across all SSTs even if individual blocks
    /// or whole files are corrupt.
    pub errors: Vec<BlockVerifyError>,
    /// Non-fatal findings: data verified, but ECC could not be checked for
    /// some tables (unrecognized scheme — recompaction recommended). Distinct
    /// from `errors`: warnings do NOT make [`Self::is_ok`] false.
    pub warnings: Vec<BlockVerifyWarning>,
}
453
454impl BlockVerifyReport {
455    /// `true` if every block in every SST verified clean. Warnings (e.g. an
456    /// unrecognized ECC scheme whose data still checksum-verified) do NOT
457    /// make this false — only real corruption (`errors`) does.
458    #[must_use]
459    pub fn is_ok(&self) -> bool {
460        self.errors.is_empty()
461    }
462
463    /// `true` if the scrub produced any non-fatal warning.
464    #[must_use]
465    pub fn has_warnings(&self) -> bool {
466        !self.warnings.is_empty()
467    }
468}
469
/// Options for the block-checksum scrubber
/// ([`verify_block_checksums_with`] / [`AbstractTree::verify_checksum_with`](crate::AbstractTree::verify_checksum_with)).
///
/// The [`Default`] value scans sequentially with no throttle. Both fields can
/// also be set through the builder-style methods of the same name.
#[derive(Clone, Debug)]
pub struct VerifyOptions {
    /// Number of SSTs to scan concurrently. Clamped to `>= 1` and to the table
    /// count. `1` (the default) scans sequentially in table order with no
    /// thread spawn. Per-SST scans are independent (each opens its own file
    /// through the table's `Fs` handle), so they parallelize cleanly.
    pub parallelism: usize,

    /// Minimum delay each worker waits after finishing one SST before taking
    /// the next, capping I/O pressure on a production box during a scrub.
    /// `None` (default) runs at full speed. No delay is added after the last
    /// table a worker scans.
    pub throttle: Option<core::time::Duration>,
}
485
486impl Default for VerifyOptions {
487    fn default() -> Self {
488        Self {
489            parallelism: 1,
490            throttle: None,
491        }
492    }
493}
494
495impl VerifyOptions {
496    /// Sets the number of SSTs to scan concurrently.
497    #[must_use]
498    pub const fn parallelism(mut self, workers: usize) -> Self {
499        self.parallelism = workers;
500        self
501    }
502
503    /// Sets the per-worker inter-SST throttle delay.
504    #[must_use]
505    pub const fn throttle(mut self, delay: core::time::Duration) -> Self {
506        self.throttle = Some(delay);
507        self
508    }
509}
510
511/// Merges a per-SST partial report into an accumulator.
512fn merge_report(dst: &mut BlockVerifyReport, src: BlockVerifyReport) {
513    dst.sst_files_scanned += src.sst_files_scanned;
514    dst.blocks_scanned += src.blocks_scanned;
515    dst.errors.extend(src.errors);
516    dst.warnings.extend(src.warnings);
517}
518
519/// Scans one SST and returns a partial report (`sst_files_scanned == 1`).
520///
521/// Self-contained per table: opens the file through the table's own `Fs`
522/// handle, sizes encryption overhead and ECC params from the table's
523/// descriptor, so it can run on its own worker thread without shared state.
524fn scan_one_table(table: &crate::table::Table) -> BlockVerifyReport {
525    let mut report = BlockVerifyReport {
526        sst_files_scanned: 1,
527        ..BlockVerifyReport::default()
528    };
529    let path: &Path = &table.path;
530    let table_id = table.id();
531
532    // Tables whose ECC descriptor decodes to a scheme this build can't apply
533    // can't have their SST-block parity trailers sized (the length isn't
534    // derivable without the scheme), so those sections are skipped with a
535    // warning rather than mis-walked. The self-describing `meta` / `meta_mid`
536    // sections are still walked (parity sized from their own `block_flags`),
537    // so corruption there is NOT downgraded. The per-block read path still
538    // serves the data (framed by data_length, checksum-verified), hence a
539    // warning, not an error.
540    let ecc_unrecognized = table.metadata.ecc_unrecognized;
541    if ecc_unrecognized {
542        log::warn!(
543            "table {table_id} at {}: unrecognized ECC scheme — skipping the \
544             ECC-dependent block sections; recompact to re-stamp with a \
545             supported scheme",
546            path.display(),
547        );
548        report.warnings.push(BlockVerifyWarning::UnrecognizedEcc {
549            table_id,
550            path: path.to_path_buf(),
551        });
552    }
553
554    // Use each Table's own `Fs` handle (StdFs, MemFs, IoUring, …).
555    // Encryption overhead is per-table (different keys / AEAD suites can attach
556    // to different SSTs), so feed each table's `max_overhead()` separately.
557    let max_enc_overhead = table.encryption.as_ref().map_or(0u32, |e| e.max_overhead());
558    match scan_sst_blocks(
559        &*table.fs,
560        path,
561        table_id,
562        max_enc_overhead,
563        table.metadata.ecc_params,
564        ecc_unrecognized,
565    ) {
566        Ok(per_file) => {
567            report.blocks_scanned += per_file.blocks_scanned;
568            report.errors.extend(per_file.errors);
569        }
570        Err(error) => {
571            report.errors.push(BlockVerifyError::SstFileUnreadable {
572                table_id,
573                path: path.to_path_buf(),
574                error,
575            });
576        }
577    }
578    report
579}
580
581/// Walks every block in every SST referenced by the tree's current
582/// version and verifies each block's XXH3 checksum.
583///
584/// Pipeline per SST:
585///
586/// 1. Open the file and parse the SFA trailer to obtain the TOC.
587/// 2. For each TOC section, skip if its name is in `RAW_FORMAT_SECTIONS`
588///    (those payloads are not `Header`-prefixed and are covered by the
589///    SFA-trailer checksum). Otherwise seek to the section's start
590///    offset and walk it as a contiguous block region in
591///    `[start, start + length)`.
592/// 3. Inside each block region, decode each block's `Header` (which
593///    validates the header's own XXH3), read the data segment, and
594///    compare a fresh XXH3 over the data against `header.checksum`.
595///    Advance by `Header::header_len(block_type) + data_length` until the
596///    section end. A corrupt header inside a section stops that
597///    section's walk and is reported; the next section is still walked.
598///
599/// This is the read-side scrub primitive: it catches the same bit-rot
600/// signal a live read would surface, ahead of time, with per-block
601/// `(file, offset)` granularity. Decompression and decryption errors
602/// are out of scope here — those depend on per-level/per-block context
603/// (compression policy, encryption key, dictionary) that the scrub
604/// path does not need to reach checksum-level corruption.
605#[must_use]
606pub fn verify_block_checksums(tree: &impl crate::AbstractTree) -> BlockVerifyReport {
607    verify_block_checksums_with(tree, &VerifyOptions::default())
608}
609
610/// Like [`verify_block_checksums`] but with configurable parallelism and
611/// throttle (see [`VerifyOptions`]).
612///
613/// With `parallelism == 1` (default) SSTs are scanned sequentially in table
614/// order. With `> 1`, up to that many worker threads pull SSTs from a shared
615/// cursor and scan them concurrently (each scan is independent — its own file
616/// handle through the table's `Fs`), then their partial reports are merged.
617/// Parallel runs report the same findings as a sequential run; only the order
618/// of `errors` / `warnings` may differ. `throttle` makes each worker pause
619/// between SSTs so a scrub does not saturate production I/O.
620#[must_use]
621pub fn verify_block_checksums_with(
622    tree: &impl crate::AbstractTree,
623    options: &VerifyOptions,
624) -> BlockVerifyReport {
625    let version = tree.current_version();
626    let tables: Vec<crate::table::Table> = version.iter_tables().cloned().collect();
627
628    // `parallelism` + `throttle` only drive the std thread-fan-out + sleep below.
629    #[cfg(not(feature = "std"))]
630    let _ = options;
631
632    // Parallel scan (std only): up to `parallelism` worker threads pull SSTs from
633    // a shared cursor and scan them concurrently. A `no_std` build has no
634    // threads, so it always takes the serial path below.
635    #[cfg(feature = "std")]
636    {
637        let workers = options.parallelism.max(1).min(tables.len().max(1));
638        if workers > 1 {
639            let cursor = core::sync::atomic::AtomicUsize::new(0);
640            let partials = std::thread::scope(|scope| {
641                let handles: Vec<_> = (0..workers)
642                    .map(|_| {
643                        scope.spawn(|| {
644                            let mut local = BlockVerifyReport::default();
645                            let mut idx =
646                                cursor.fetch_add(1, core::sync::atomic::Ordering::Relaxed);
647                            while let Some(table) = tables.get(idx) {
648                                merge_report(&mut local, scan_one_table(table));
649                                // Claim the next SST first; only pause if this
650                                // worker actually has another table to scan.
651                                idx = cursor.fetch_add(1, core::sync::atomic::Ordering::Relaxed);
652                                if tables.get(idx).is_some()
653                                    && let Some(delay) = options.throttle
654                                {
655                                    std::thread::sleep(delay);
656                                }
657                            }
658                            local
659                        })
660                    })
661                    .collect();
662                handles
663                    .into_iter()
664                    .map(|handle| match handle.join() {
665                        Ok(local) => local,
666                        // A scrub worker panicking is a bug, not a corruption
667                        // finding — propagate rather than drop its SSTs.
668                        Err(payload) => std::panic::resume_unwind(payload),
669                    })
670                    .collect::<Vec<_>>()
671            });
672
673            let mut report = BlockVerifyReport::default();
674            for partial in partials {
675                merge_report(&mut report, partial);
676            }
677            return report;
678        }
679    }
680
681    // Serial scan: every `no_std` build, and `std` with `parallelism <= 1`. Scans
682    // SSTs in deterministic table order, each over its own `Fs` handle.
683    let mut report = BlockVerifyReport::default();
684    for (idx, table) in tables.iter().enumerate() {
685        merge_report(&mut report, scan_one_table(table));
686        // Inter-SST throttle (std only — `no_std` has no sleep primitive). Skip
687        // after the final table so a finished scrub returns promptly instead of
688        // waiting one extra throttle interval.
689        #[cfg(feature = "std")]
690        if idx + 1 < tables.len()
691            && let Some(delay) = options.throttle
692        {
693            std::thread::sleep(delay);
694        }
695        #[cfg(not(feature = "std"))]
696        let _ = idx;
697    }
698    report
699}
700
701/// Verifies the per-KV checksum footer of every data block across all SST
702/// tables in the tree (the paranoid / scrub integrity path).
703///
704/// Footer presence is a per-SST property read from each table's descriptor
705/// (`ParsedMeta::kv_checksum_algo`), not a per-block header flag — SST data
706/// blocks omit the `block_flags` byte. A table whose descriptor reports no
707/// footers is skipped wholesale.
708///
709/// This is stronger than [`verify_block_checksums`]: for footer-bearing
710/// tables it decodes each block and recomputes every entry's logical-content
711/// digest, localising which entry diverged rather than only flagging the
712/// block. Tables written without per-KV footers carry no per-KV digests and
713/// are covered by [`verify_block_checksums`] only.
714///
715/// Returns the first error encountered (`ChecksumMismatch` on a per-entry
716/// digest disagreement, or an I/O / decode error). `Ok(())` means every
717/// per-KV-checked table verified. A tree written entirely with
718/// `kv_checksums = Off` has no footer-bearing tables, so this is a no-op
719/// returning `Ok(())`.
720///
721/// # Errors
722///
723/// Propagates [`crate::Error::ChecksumMismatch`] on a detected per-entry
724/// corruption, or any I/O / decode error from loading a block.
725pub fn verify_kv_checksums(tree: &impl crate::AbstractTree) -> crate::Result<()> {
726    let version = tree.current_version();
727    for table in version.iter_tables() {
728        table.verify_kv_checksums()?;
729    }
730    Ok(())
731}
732
/// Out-of-band variant of [`verify_block_checksums`].
///
/// Scrubs a single SST file addressed by filesystem path; neither a live
/// `Tree` nor the version manifest is required. Meant for offline
/// diagnostic tools (`tools/sst-dump verify`, `repair_db`, forensics CLIs)
/// that look at one file in isolation — for example when the manifest
/// itself is corrupt or the surrounding tree directory has been moved.
///
/// The file is read through [`StdFs`](crate::fs::StdFs) (the only `Fs`
/// backend that makes sense for an out-of-band tool — `MemFs` / `IoUring`
/// trees never produce files at real filesystem paths), and error reports
/// are stamped with `table_id = 0`. Downstream filtering / logging should
/// therefore identify the file by path, not by table id.
///
/// AEAD overhead is conservatively assumed to be zero: out-of-band tools
/// don't carry the per-table encryption provider that would let them
/// recover the real `max_overhead()`. Encrypted SSTs near the 256 MiB
/// plaintext ceiling may therefore false-flag as
/// [`BlockVerifyError::HeaderCorrupted`]. In practice block sizes are
/// typically a few KiB, so this only matters on artificially-constructed
/// huge blocks; encrypted-aware verification should go through
/// [`verify_block_checksums`] on a live tree.
///
/// The returned [`BlockVerifyReport`] always has `sst_files_scanned == 1`,
/// plus whatever per-block errors the walk collected.
#[cfg(feature = "std")]
#[must_use]
pub fn verify_sst_file(path: &std::path::Path) -> BlockVerifyReport {
    let std_fs = crate::fs::StdFs;
    verify_sst_file_with_fs(&std_fs, path)
}
765
/// Variant of [`verify_sst_file`] that reads `path` through a caller-supplied
/// filesystem backend.
///
/// `pub(crate)` so `repair` can block-verify an SST on the tree's own `Fs`
/// before deciding whether to salvage it, rather than assuming `StdFs`.
///
/// The returned report always has `sst_files_scanned == 1`; every error and
/// warning it carries is stamped with `table_id = 0`. The block walk runs with
/// an AEAD overhead of zero.
#[cfg(feature = "std")]
pub(crate) fn verify_sst_file_with_fs(
    fs: &dyn crate::fs::Fs,
    path: &std::path::Path,
) -> BlockVerifyReport {
    let mut report = BlockVerifyReport {
        sst_files_scanned: 1,
        ..BlockVerifyReport::default()
    };

    // All file-level failures below share one shape; only the underlying
    // error differs.
    let unreadable = |error: io::Error| BlockVerifyError::SstFileUnreadable {
        table_id: 0,
        path: path.to_path_buf(),
        error,
    };

    // SST blocks carry no `block_flags` byte, so whether a parity trailer
    // follows each block (and how long it is) is only known from the per-SST
    // ECC descriptor in the meta block. When that descriptor cannot be read we
    // must NOT fall back to "ECC disabled": walking an ECC-bearing SST without
    // skipping its parity trailers mis-aligns the scan and reports spurious
    // corruption. The indeterminacy is surfaced instead and the walk skipped.
    let (ecc, ecc_unrecognized) = match read_ecc_params_out_of_band(fs, path) {
        Ok(Some(ScrubEcc::Off)) => (None, false),
        Ok(Some(ScrubEcc::Scheme(params))) => (Some(params), false),
        // A scheme this build cannot apply: the trailer length of SST blocks
        // is not derivable, so those sections are skipped during the walk.
        // The self-describing `meta` / `meta_mid` sections still size parity
        // from `block_flags`, so corruption there is NOT downgraded. Warn and
        // keep going rather than dropping the whole scrub.
        Ok(Some(ScrubEcc::Unrecognized)) => {
            log::warn!(
                "{}: unrecognized ECC scheme — skipping the ECC-dependent block \
                 sections; recompact to re-stamp with a supported scheme",
                path.display(),
            );
            report.warnings.push(BlockVerifyWarning::UnrecognizedEcc {
                table_id: 0,
                path: path.to_path_buf(),
            });
            (None, true)
        }
        // The file and its trailer are readable but neither meta block
        // decodes (corrupt meta, or an encrypted SST with no key available
        // out-of-band): the scheme is undeterminable, so do not walk.
        Ok(None) => {
            report.errors.push(unreadable(io::Error::new(
                io::ErrorKind::InvalidData,
                "could not decode the SST meta block to determine the ECC scheme \
                 (corrupt meta, or an encrypted SST with no key out-of-band); \
                 skipping the block walk — use verify_block_checksums on a live \
                 tree for ECC-aware verification",
            )));
            return report;
        }
        // Genuine file-open / SFA-trailer failure: keep the underlying error
        // instead of collapsing it into the "undeterminable" message above.
        Err(error) => {
            report.errors.push(unreadable(error.into()));
            return report;
        }
    };

    match scan_sst_blocks(fs, path, 0, 0, ecc, ecc_unrecognized) {
        Ok(PerFileScan {
            blocks_scanned,
            errors,
        }) => {
            report.blocks_scanned = blocks_scanned;
            report.errors = errors;
        }
        Err(error) => report.errors.push(unreadable(error)),
    }

    report
}
854
/// Per-SST ECC state as seen by the out-of-band scrub.
///
/// Produced by `read_ecc_params_out_of_band` from a decoded meta block and
/// consumed by `verify_sst_file_with_fs`, which maps it onto the `ecc` /
/// `ecc_unrecognized` arguments of the block walk.
#[cfg(feature = "std")]
enum ScrubEcc {
    /// ECC off — no parity trailer to skip.
    Off,
    /// A recognized + applicable scheme — the walk sizes each SST block's
    /// parity trailer with it so the trailer can be skipped.
    Scheme(crate::table::block::EccParams),
    /// An ECC scheme this build can't apply (unimplemented / unknown /
    /// non-canonical). The trailer length of SST blocks isn't derivable, so
    /// the caller warns and the walk skips the ECC-dependent sections; the
    /// self-describing `meta` / `meta_mid` sections are still walked.
    Unrecognized,
}
867
/// Best-effort lookup of an SST's ECC state from its meta descriptor, for the
/// out-of-band scrub where no live `Table` exists to ask.
///
/// The tail `meta` section is authoritative and is tried first. If that
/// section is missing or its block does not decode, the early `meta_mid`
/// mirror (written so that a single bad meta block cannot lose the
/// descriptor) is tried next.
///
/// # Returns
///
/// * `Ok(Some(state))` — one of the two meta blocks decoded.
/// * `Ok(None)` — the file and its SFA trailer are readable but NEITHER meta
///   block decodes (both corrupt, or an encrypted SST whose key the
///   out-of-band tool does not have). The scheme is genuinely UNDETERMINABLE.
/// * `Err(_)` — the file could not be opened or its SFA trailer could not be
///   parsed.
///
/// Callers MUST NOT read `Ok(None)` as "ECC disabled": walking an ECC-bearing
/// SST without skipping its parity trailers mis-aligns the block scan and
/// reports spurious corruption. The caller skips the walk and surfaces the
/// indeterminacy instead.
#[cfg(feature = "std")]
fn read_ecc_params_out_of_band(
    fs: &dyn crate::fs::Fs,
    path: &std::path::Path,
) -> std::io::Result<Option<ScrubEcc>> {
    // Tried in order: the authoritative tail copy, then its early mirror.
    const META_SECTIONS: [&[u8]; 2] = [b"meta", b"meta_mid"];

    let mut probe = fs.open(path, &crate::fs::FsOpenOptions::new().read(true))?;
    let sfa_reader = match crate::sfa::Reader::from_reader(&mut probe) {
        Ok(parsed) => parsed,
        Err(e) => return Err(std::io::Error::new(std::io::ErrorKind::InvalidData, e)),
    };
    let toc = sfa_reader.toc();

    // The first candidate that is present, fits a `u32` length, and decodes
    // wins; anything else falls through to the next candidate.
    let state = META_SECTIONS.iter().copied().find_map(|name| {
        let section = toc.section(name)?;
        let size = u32::try_from(section.len()).ok()?;
        let handle = crate::table::BlockHandle::new(crate::table::BlockOffset(section.pos()), size);
        // Loaded with no encryption and no table identity: the scrub only
        // ever reads unencrypted meta here, so the AAD identity is unused.
        let meta =
            crate::table::meta::ParsedMeta::load_with_handle(probe.as_ref(), &handle, None, None)
                .ok()?;
        Some(match (meta.ecc_unrecognized, meta.ecc_params) {
            (true, _) => ScrubEcc::Unrecognized,
            (false, Some(params)) => ScrubEcc::Scheme(params),
            (false, None) => ScrubEcc::Off,
        })
    });

    Ok(state)
}
920
/// Outcome of walking a single SST file, as returned by `scan_sst_blocks`.
struct PerFileScan {
    /// Number of blocks whose header decoded successfully during the walk.
    /// The counter is bumped right after the header decode, so blocks that
    /// later fail a length check, a data read, or the data checksum are
    /// still included.
    blocks_scanned: usize,
    /// Every per-block and per-section error collected during the walk.
    errors: Vec<BlockVerifyError>,
}
925
926/// Walks every block of one SST. Returns `Err` only on file-open or
927/// SFA trailer-parse failure (those make the whole walk impossible).
928/// Per-block AND per-section errors — corrupt block headers, mismatched
929/// data checksums, post-header data-read failures, and TOC sections we
930/// cannot seek to — all land inside `PerFileScan::errors` and never
931/// cause an early return; the walker proceeds to the next section so
932/// one bad TOC entry cannot mask corruption in the others.
933fn scan_sst_blocks(
934    fs: &dyn crate::fs::Fs,
935    path: &Path,
936    table_id: TableId,
937    max_enc_overhead: u32,
938    ecc: Option<crate::table::block::EccParams>,
939    ecc_unrecognized: bool,
940) -> io::Result<PerFileScan> {
941    use io::BufReader;
942    #[cfg(not(feature = "std"))]
943    use io::{Seek, SeekFrom};
944    #[cfg(feature = "std")]
945    use std::io::{Seek, SeekFrom};
946
947    let mut file = fs.open(path, &crate::fs::FsOpenOptions::new().read(true))?;
948
949    // The SFA trailer + TOC live at the tail of the file.
950    // crate::sfa::Reader::from_reader leaves the cursor at an undefined
951    // offset; each per-section walk below explicitly seeks to the
952    // section's `pos()` first so the unknown post-trailer position
953    // doesn't matter.
954    // Capture the sfa error's Debug form in the message. crate::io::Error is
955    // message-only (no source chain) so it stays portable on no_std; the `{:?}`
956    // repr keeps the original variant (InvalidHeader / InvalidVersion /
957    // ChecksumMismatch / underlying Io) visible for downstream diagnostics, just
958    // as a string rather than a downcastable `Error::source()`.
959    let sfa_reader = crate::sfa::Reader::from_reader(&mut file)
960        .map_err(|e| io::Error::new(io::ErrorKind::InvalidData, alloc::format!("{e:?}")))?;
961    let toc = sfa_reader.toc();
962    // SFA TOC layout for an SST. The writer opens the file and
963    // immediately calls `crate::sfa::Writer::start("data")`, so the first
964    // TOC entry is named (not unnamed) and covers the data-block
965    // region. Other named sections, in writer order:
966    //
967    //   - `data`              : block-format (data blocks)
968    //   - `index`             : block-format (partitioned index leaf
969    //                           blocks; absent for full-index tables,
970    //                           emitted before `tli` by
971    //                           `PartitionedIndexWriter::finish`)
972    //   - `tli`               : block-format (top-level index, both
973    //                           full and partitioned variants)
974    //   - `filter`            : block-format (filter blocks)
975    //   - `filter_tli`        : block-format (top-level filter for
976    //                           partitioned filters; absent for full
977    //                           filters, emitted after `filter` by
978    //                           `PartitionedFilterWriter::finish`)
979    //   - `range_tombstones`  : block-format (optional)
980    //   - `meta_mid`          : block-format (early mirror of `meta`)
981    //   - `linked_blob_files` : RAW length-prefixed list of u64s
982    //   - `table_version`     : RAW single byte
983    //   - `meta_separator`    : RAW 4 KiB zero padding
984    //   - `tli_tail`          : block-format (tail mirror of `tli`)
985    //   - `meta`              : block-format (metadata, authoritative)
986    //
987    // Block-format sections are walked block-by-block (each block
988    // prefixed with the standard `Header`). Raw-format sections are
989    // skipped — their integrity is covered by the SFA-trailer
990    // checksum verified at table-open time. New section names default
991    // to "walk" (must be added to `RAW_FORMAT_SECTIONS` if they're
992    // raw), so a forgotten-to-handle section fails loud rather than
993    // silently passing a corruption.
994
995    let mut reader = BufReader::with_capacity(64 * 1024, file);
996    let mut blocks_scanned: usize = 0;
997    let mut errors: Vec<BlockVerifyError> = Vec::new();
998    // One reusable data buffer across the whole SST — sized up via
999    // `resize` per block instead of a fresh `vec![0u8; N]` allocation
1000    // each iteration. On large trees this turns thousands of malloc
1001    // calls into a single growing allocation that settles at the
1002    // largest block size seen.
1003    let mut data_buf: Vec<u8> = Vec::new();
1004
1005    for entry in toc.iter() {
1006        if RAW_FORMAT_SECTIONS.contains(&entry.name()) {
1007            continue;
1008        }
1009        let start = entry.pos();
1010        // `checked_add` (not `saturating_add`) so a corrupted or
1011        // forged TOC length cannot silently collapse to `u64::MAX`
1012        // and let the walk treat the whole address space as one
1013        // section. On overflow we surface the section as a
1014        // file-level `TocCorrupted` and skip walking it — the other
1015        // (still-walkable) sections of the same SST are honoured.
1016        // `TocCorrupted` rather than `HeaderCorrupted` because the
1017        // failure is at the section-catalogue layer, not inside any
1018        // individual block.
1019        let Some(end) = start.checked_add(entry.len()) else {
1020            errors.push(BlockVerifyError::TocCorrupted {
1021                table_id,
1022                path: path.to_path_buf(),
1023                section_name: entry.name().to_vec(),
1024                section_offset: start,
1025                reason: format!(
1026                    "section length {} overflows u64 when added to start offset {start}",
1027                    entry.len(),
1028                ),
1029            });
1030            continue;
1031        };
1032        // Mid-walk seek failure: don't propagate as a file-level Err
1033        // (that would discard everything already scanned and report
1034        // the whole SST as unreadable, which contradicts the
1035        // function's contract). Surface as a `TocCorrupted` for this
1036        // section and skip walking it; subsequent sections still run.
1037        // Again `TocCorrupted` (not `HeaderCorrupted`): we never even
1038        // reached a block to decode its header.
1039        if let Err(e) = reader.seek(SeekFrom::Start(start)) {
1040            errors.push(BlockVerifyError::TocCorrupted {
1041                table_id,
1042                path: path.to_path_buf(),
1043                section_name: entry.name().to_vec(),
1044                section_offset: start,
1045                reason: format!("seek to section start failed: {e}"),
1046            });
1047            continue;
1048        }
1049        let mut ctx = WalkCtx {
1050            reader: &mut reader,
1051            table_id,
1052            path,
1053            data_buf: &mut data_buf,
1054            blocks_scanned: &mut blocks_scanned,
1055            errors: &mut errors,
1056            max_data_length: block_data_length_cap(max_enc_overhead),
1057            ecc,
1058            ecc_unrecognized,
1059        };
1060        walk_block_region(&mut ctx, start, end);
1061    }
1062
1063    Ok(PerFileScan {
1064        blocks_scanned,
1065        errors,
1066    })
1067}
1068
/// SFA TOC section names whose payload is NOT a run of `Block`s, i.e. is
/// NOT prefixed with the standard `Header`. `scan_sst_blocks` skips any
/// section whose name appears here — their integrity is covered by the
/// SFA-trailer checksum verified at table-open time. Every other section
/// (`data` / `index` / `tli` / `filter` / `filter_tli` /
/// `range_tombstones` / `meta_mid` / `tli_tail` / `meta`) is a
/// `Header`-prefixed block run and gets walked. See `scan_sst_blocks` for
/// the full section catalogue and the writer-side source of truth.
///
/// `meta_separator` is the 4 KiB zero-padding section the writer
/// emits between the MID and TAIL meta blocks so a single bad
/// filesystem sector cannot take out both copies — it carries no
/// blocks and must be skipped here, otherwise the walker would try
/// to decode zeros as a `Header` and report a spurious
/// `HeaderCorrupted` on every clean SST.
///
/// A newly introduced raw section must be added to this list; a name that
/// is missing here is walked as a block run.
const RAW_FORMAT_SECTIONS: &[&[u8]] = &[b"linked_blob_files", b"table_version", b"meta_separator"];
1085
/// Largest plaintext on-disk data segment a single block may declare:
/// 256 MiB, mirroring `table::block::MAX_DECOMPRESSION_SIZE`.
/// Encrypted blocks may legitimately exceed it by up to the AEAD provider's
/// `max_overhead()`; `block_data_length_cap` folds that overhead in to give
/// the cap a walk actually enforces.
const MAX_BLOCK_DATA_LENGTH: u64 = 256 << 20;

/// Returns the `data_length` ceiling enforced during one scan.
///
/// Mirrors the size validation in `Block::from_file`: the plaintext cap plus
/// the table's AEAD `max_overhead()` (pass `0` when encryption is disabled).
/// A header declaring more than this is reported as `HeaderCorrupted` no
/// matter what the TOC bounds say, which defends against DoS-by-allocation
/// when the block header and its enclosing TOC entry are both corrupted or
/// forged at the same time.
fn block_data_length_cap(max_enc_overhead: u32) -> u64 {
    // Widen before adding: 2^28 + u32::MAX fits comfortably in a u64.
    let aead_overhead = u64::from(max_enc_overhead);
    MAX_BLOCK_DATA_LENGTH + aead_overhead
}
1102
/// Mutable cursor + scratch state threaded through `walk_block_region`.
///
/// Bundles the per-walk accumulators (file cursor, reused data
/// buffer, counters, error sink) into one borrow so the function
/// signature stays under clippy's argument-count cap.
///
/// For reference, `walk_block_region` walks the contiguous block range
/// `[start_offset, end_offset)`, decoding each block's header (which
/// validates the header's own XXH3) and then re-hashing the data segment
/// against `header.checksum`. It stops at the first un-parseable header
/// inside the range — that block is reported as `HeaderCorrupted` and the
/// rest of the range is skipped because subsequent offsets become
/// unrecoverable without a valid length field.
struct WalkCtx<'a> {
    /// Buffered file cursor. The caller seeks it to the section start before
    /// each walk; the walk then only reads forward from it.
    reader: &'a mut io::BufReader<Box<dyn crate::fs::FsFile>>,
    /// Table id stamped into every error the walk reports.
    table_id: TableId,
    /// File path stamped into every error the walk reports.
    path: &'a Path,
    /// Scratch buffer for block payloads, resized to each block's
    /// `data_length` and reused across blocks and sections.
    data_buf: &'a mut Vec<u8>,
    /// Running count of blocks whose header decoded successfully.
    blocks_scanned: &'a mut usize,
    /// Sink for every per-block error found during the walk.
    errors: &'a mut Vec<BlockVerifyError>,
    /// Effective `data_length` cap (plaintext limit + AEAD overhead).
    /// Matches the bound `Block::from_file` applies on the read path,
    /// so the scrub does not false-flag legitimate encrypted blocks
    /// near the 256 MiB plaintext limit as `HeaderCorrupted`.
    max_data_length: u64,
    /// Per-SST Page-ECC shard layout. SST blocks (`Data` / `Index` / `Filter` /
    /// `RangeTombstone`) omit the `block_flags` byte, so their parity-trailer
    /// presence AND shard layout are NOT derivable from the header — both come
    /// from this table-wide descriptor scheme. When `Some`, each such block
    /// carries `expected_parity_len(data_length, scheme)` parity bytes after
    /// the payload that the walk must skip (sized by the scheme) to stay
    /// aligned. Meta / Manifest / `ManifestFooter` blocks keep the byte and
    /// self-describe parity via their `ECC_PARITY` bit, sized with the fixed
    /// RS(4,2) layout the writer uses for them, regardless of this field.
    ecc: Option<crate::table::block::EccParams>,
    /// `true` when the table's ECC descriptor decodes to a scheme this build
    /// can't apply. The trailer length of its SST blocks (`Data` / `Index` /
    /// `Filter` / `RangeTombstone`) isn't derivable, so those sections are
    /// skipped (the caller warns once). Self-describing sections (`meta` /
    /// `meta_mid`) still size parity from `block_flags` and ARE walked.
    ecc_unrecognized: bool,
}
1143
1144fn walk_block_region(ctx: &mut WalkCtx<'_>, start_offset: u64, end_offset: u64) {
1145    #[cfg(not(feature = "std"))]
1146    use io::Read;
1147    #[cfg(feature = "std")]
1148    use std::io::Read;
1149
1150    let mut offset = start_offset;
1151
1152    while offset < end_offset {
1153        // Confine reads to the declared section before touching
1154        // Header::decode_from. Without this pre-check, a TOC entry
1155        // whose `len` puts `end_offset` inside the first block's
1156        // header region would let `decode_from` consume up to
1157        // `header_len` bytes — reading past the section boundary
1158        // into the next section's payload, where random bytes might
1159        // happen to parse as a "valid" header and silently corrupt
1160        // the walk. Treat the under-sized tail as `HeaderCorrupted`
1161        // and stop this section's walk; subsequent sections still
1162        // run because `walk_block_region` returns rather than
1163        // bubbling the error up.
1164        let remaining_in_section = end_offset - offset;
1165        // Lower bound: the header is at least MIN_LEN (the exact length, with
1166        // or without the block_flags byte, is known only after decode).
1167        if remaining_in_section < Header::MIN_LEN as u64 {
1168            ctx.errors.push(BlockVerifyError::HeaderCorrupted {
1169                table_id: ctx.table_id,
1170                path: ctx.path.to_path_buf(),
1171                offset,
1172                reason: format!(
1173                    "section has only {remaining_in_section} bytes left at this offset, \
1174                     less than Header::MIN_LEN = {}",
1175                    Header::MIN_LEN,
1176                ),
1177            });
1178            return;
1179        }
1180        let header = match Header::decode_from(ctx.reader) {
1181            Ok(h) => h,
1182            Err(e) => {
1183                ctx.errors.push(BlockVerifyError::HeaderCorrupted {
1184                    table_id: ctx.table_id,
1185                    path: ctx.path.to_path_buf(),
1186                    offset,
1187                    reason: format!("{e:?}"),
1188                });
1189                return;
1190            }
1191        };
1192
1193        // Unrecognized-ECC table: SST blocks (no `block_flags` byte) carry a
1194        // parity trailer whose length we can't derive without the descriptor
1195        // scheme, so this section can't be walked — stop here (the caller has
1196        // already warned). Self-describing blocks (`block_flags` present) size
1197        // parity from their `ECC_PARITY` bit, so those sections still walk.
1198        // Checked before the scanned-count increment so skipped blocks aren't
1199        // tallied. Sections are homogeneous in block type, so the first block
1200        // decides the whole section.
1201        if ctx.ecc_unrecognized && !Header::has_block_flags(header.block_type) {
1202            return;
1203        }
1204
1205        // Count the block as "header-read" immediately on successful
1206        // decode — matches the BlockVerifyReport.blocks_scanned docs
1207        // ("includes blocks where the data checksum subsequently
1208        // failed"). Without this early increment, blocks that emit
1209        // DataReadError / data-length-bounds HeaderCorrupted would
1210        // be silently uncounted, contradicting the documented
1211        // semantics.
1212        // Block counter; a tree cannot hold 2^64 blocks, so a plain add cannot
1213        // overflow.
1214        *ctx.blocks_scanned += 1;
1215
1216        // Actual header length for this block (variable: SST blocks omit the
1217        // block_flags byte). Used for the section-bounds math and the offset
1218        // advance so the walk tracks what `decode_from` actually consumed.
1219        let header_len = Header::header_len(header.block_type) as u64;
1220
1221        // Page-ECC parity trailer that follows the payload on disk. Presence
1222        // depends on the block type: Meta / Manifest / ManifestFooter keep the
1223        // block_flags byte and self-describe via the ECC_PARITY bit; SST blocks
1224        // omit the byte, so parity presence is the per-SST `page_ecc` flag. The
1225        // trailer length is derived from data_length (never stored). The walk
1226        // must skip these bytes — otherwise the next iteration would read parity
1227        // as the following block's header and mis-align the whole section.
1228        // Parity-trailer scheme to skip for this block. Self-describing blocks
1229        // (Meta / Manifest / `ManifestFooter`) carry the `block_flags` byte and
1230        // are written with the fixed RS(4,2) layout; SST blocks size their
1231        // trailer from the per-SST descriptor scheme threaded in via `ctx.ecc`.
1232        let block_ecc = if Header::has_block_flags(header.block_type) {
1233            (header.block_flags & crate::table::block::header::block_flags::ECC_PARITY != 0)
1234                .then_some(crate::table::block::EccParams::RS_4_2)
1235        } else {
1236            ctx.ecc
1237        };
1238        let parity_len = block_ecc.map_or(0, |scheme| {
1239            u64::from(crate::table::block::expected_parity_len(
1240                header.data_length,
1241                scheme,
1242            ))
1243        });
1244
1245        // Validate data_length against TWO bounds before allocating
1246        // / reading:
1247        //
1248        // 1. Hard cap (MAX_BLOCK_DATA_LENGTH = 256 MiB, mirroring
1249        //    table::block::MAX_DECOMPRESSION_SIZE). Catches the case
1250        //    where BOTH the block header AND the enclosing TOC entry
1251        //    are simultaneously corrupted/forged so that `remaining`
1252        //    becomes arbitrarily large. Without this, a forged TOC
1253        //    entry with len=u64::MAX could let the section-bounds
1254        //    check pass and trigger a multi-GB Vec::resize.
1255        //
1256        // 2. Remaining bytes in this TOC section. Header::decode_from
1257        //    already verified the header's own XXH3, so a data_length
1258        //    that overruns the section bounds is either bit-flip
1259        //    corruption that happened to keep the header digest
1260        //    valid (rare but possible), or fuzz input. Honouring it
1261        //    would read past `end_offset` into the next section.
1262        //
1263        // Both bounds are reported as HeaderCorrupted — the header
1264        // was technically parseable but its length field is invalid.
1265        let data_length_u64 = u64::from(header.data_length);
1266        if data_length_u64 > ctx.max_data_length {
1267            ctx.errors.push(BlockVerifyError::HeaderCorrupted {
1268                table_id: ctx.table_id,
1269                path: ctx.path.to_path_buf(),
1270                offset,
1271                reason: format!(
1272                    "header data_length {data_length_u64} exceeds hard cap {}",
1273                    ctx.max_data_length,
1274                ),
1275            });
1276            return;
1277        }
1278        // A header whose own bytes cross the section boundary is corrupt and must
1279        // be rejected here: clamping `remaining` to zero would let a header with a
1280        // zero-length declared payload slip past the `>` check below even though
1281        // the header itself ran past the section end. Reuse the plain
1282        // `remaining_in_section` (the loop invariant `offset < end_offset` keeps
1283        // it non-negative) rather than recomputing it.
1284        if header_len > remaining_in_section {
1285            ctx.errors.push(BlockVerifyError::HeaderCorrupted {
1286                table_id: ctx.table_id,
1287                path: ctx.path.to_path_buf(),
1288                offset,
1289                reason: format!(
1290                    "block header ({header_len} bytes) extends past the section end \
1291                     ({remaining_in_section} bytes remain)",
1292                ),
1293            });
1294            return;
1295        }
1296        let remaining = remaining_in_section - header_len;
1297        // `data_length_u64` is already capped at `ctx.max_data_length` (checked
1298        // above) and `parity_len` is derived from it, so the sum is bounded well
1299        // within u64 — a plain add cannot overflow.
1300        let on_disk_payload = data_length_u64 + parity_len;
1301        if on_disk_payload > remaining {
1302            ctx.errors.push(BlockVerifyError::HeaderCorrupted {
1303                table_id: ctx.table_id,
1304                path: ctx.path.to_path_buf(),
1305                offset,
1306                reason: format!(
1307                    "header data_length {data_length_u64} + parity {parity_len} exceeds \
1308                     remaining section bytes {remaining}",
1309                ),
1310            });
1311            return;
1312        }
1313
1314        let data_length = header.data_length as usize;
1315        ctx.data_buf.resize(data_length, 0);
1316        // `as_mut_slice` returns the whole `Vec` (exactly `data_length`
1317        // bytes after the resize above) — full-slice access dodges
1318        // the crate-wide `#[deny(clippy::indexing_slicing)]`.
1319        if let Err(e) = ctx.reader.read_exact(ctx.data_buf.as_mut_slice()) {
1320            // Header was clean (XXH3 matched) but the data segment
1321            // that should follow it could not be read in full —
1322            // truncated SST, unexpected EOF, transient I/O.
1323            // Semantically distinct from HeaderCorrupted; reported
1324            // under its own variant so callers pattern-matching on
1325            // the error kind aren't surprised to find post-header
1326            // I/O failures bucketed with header-parse failures.
1327            ctx.errors.push(BlockVerifyError::DataReadError {
1328                table_id: ctx.table_id,
1329                path: ctx.path.to_path_buf(),
1330                offset,
1331                data_length: header.data_length,
1332                error: e.into(),
1333            });
1334            return;
1335        }
1336
1337        let computed = Checksum::from_raw(crate::hash::hash128(ctx.data_buf));
1338        if computed != header.checksum {
1339            ctx.errors.push(BlockVerifyError::DataCorrupted {
1340                table_id: ctx.table_id,
1341                path: ctx.path.to_path_buf(),
1342                offset,
1343                data_length: header.data_length,
1344                expected: header.checksum,
1345                got: computed,
1346            });
1347        }
1348
1349        // Consume the parity trailer (if any) so the reader cursor lands on
1350        // the next block's header. The payload checksum above already covers
1351        // correctness; parity is only consulted for ECC recovery on the live
1352        // read path, so the scrub discards it — but it MUST still skip exactly
1353        // `parity_len` bytes or the next iteration mis-reads parity as a header.
1354        if parity_len > 0 {
1355            // Discard the parity trailer so the cursor lands on the next block's
1356            // header. `crate::io` has no `copy`/`sink`, so drain exactly
1357            // `parity_len` bytes through a small scratch buffer.
1358            let mut scratch = [0u8; 512];
1359            let mut remaining = parity_len;
1360            // A short read (EOF before `parity_len`) and an underlying read error
1361            // are the same outcome for the scrub: the trailer cannot be skipped,
1362            // so collapse both into one `Err` and report a single DataReadError.
1363            let drain: io::Result<()> = loop {
1364                if remaining == 0 {
1365                    break Ok(());
1366                }
1367                let want =
1368                    usize::try_from(remaining.min(scratch.len() as u64)).unwrap_or(scratch.len());
1369                let (head, _) = scratch.split_at_mut(want);
1370                match ctx.reader.read(head) {
1371                    Ok(0) => {
1372                        break Err(io::Error::new(
1373                            io::ErrorKind::UnexpectedEof,
1374                            alloc::format!(
1375                                "parity trailer truncated: read {} of {parity_len} bytes",
1376                                parity_len - remaining
1377                            ),
1378                        ));
1379                    }
1380                    Ok(n) => remaining -= n as u64,
1381                    Err(e) => {
1382                        // EINTR is transient: retry the read rather than aborting
1383                        // the parity skip with a spurious DataReadError (matches
1384                        // the Interrupted handling in read_exact above). Convert
1385                        // first so the kind check is uniform across std/no_std.
1386                        let e: io::Error = e.into();
1387                        if e.kind() != io::ErrorKind::Interrupted {
1388                            break Err(e);
1389                        }
1390                    }
1391                }
1392            };
1393            if let Err(error) = drain {
1394                ctx.errors.push(BlockVerifyError::DataReadError {
1395                    table_id: ctx.table_id,
1396                    path: ctx.path.to_path_buf(),
1397                    offset,
1398                    data_length: header.data_length,
1399                    error,
1400                });
1401                return;
1402            }
1403        }
1404
1405        // blocks_scanned was already incremented right after a
1406        // successful Header::decode_from above — do not double-count
1407        // here.
1408        // Advance past this block. Each term is bounded (data_length capped
1409        // above, parity derived from it, header a const) and `offset` is bounded
1410        // by the section end, so the running cursor cannot overflow u64.
1411        offset += header_len + data_length_u64 + parity_len;
1412    }
1413}
1414
// Out-of-line test module, compiled only for test builds. The `expect`
// attribute permits `unwrap` / `expect` inside it for test assertions.
#[cfg(test)]
#[expect(clippy::unwrap_used, clippy::expect_used, reason = "test assertions")]
mod block_verify_tests;
lsm_tree/verify.rs

lsm_tree/
verify.rs