coordinode-lsm-tree 5.0.0

Embedded LSM-tree storage engine: BuRR filters, zstd dictionary compression, MVCC, range tombstones, merge operators, K/V separation, AES-256-GCM at rest.
Documentation
// Copyright (c) 2025-present, fjall-rs
// This source code is licensed under both the Apache 2.0 and MIT License
// (found in the LICENSE-* files in the repository)

use super::writer::TOC_MAGIC;
use crate::sfa::{
    Result,
    checksum::Checksum,
    toc::{Toc, entry::TocEntry},
};
use byteorder::ReadBytesExt;
use std::io::{Read, Seek, SeekFrom};

struct ChecksummedReader<R: std::io::Read> {
    inner: R,
    hasher: xxhash_rust::xxh3::Xxh3Default,
}

impl<R: std::io::Read> ChecksummedReader<R> {
    pub fn new(reader: R) -> Self {
        Self {
            inner: reader,
            hasher: xxhash_rust::xxh3::Xxh3Default::new(),
        }
    }

    pub fn checksum(&self) -> Checksum {
        Checksum::from_raw(self.hasher.digest128())
    }
}

impl<R> Read for ChecksummedReader<R>
where
    R: Read,
{
    /// Forwards the read to the wrapped reader and hashes the bytes it
    /// produced.
    ///
    /// Only the filled prefix of `buf` is fed to the hasher. If the inner
    /// read fails, the error is returned and the hasher is left untouched.
    fn read(&mut self, buf: &mut [u8]) -> std::io::Result<usize> {
        let bytes_read = self.inner.read(buf)?;

        #[expect(
            clippy::indexing_slicing,
            reason = "a conforming `Read` impl never reports more bytes than `buf` can hold"
        )]
        let filled = &buf[..bytes_read];
        self.hasher.update(filled);

        Ok(bytes_read)
    }
}

/// Stateless entry point for parsing a table of contents (ToC).
///
/// The type has no fields; parsing is exposed through the associated
/// function `TocReader::from_reader`.
pub struct TocReader;

impl TocReader {
    pub fn from_reader<R: Read + Seek>(
        reader: &mut R,
        toc_pos: u64,
        toc_len: u64,
        toc_checksum: Checksum,
    ) -> Result<Toc> {
        use byteorder::LE;

        log::trace!("Reading ToC");

        reader.seek(SeekFrom::Start(toc_pos))?;

        // Bound every byte the TOC parser pulls from the underlying
        // reader to `toc_len`, the on-disk TOC byte length recorded
        // in the (already-magic-checked) trailer. Without this gate
        // a forged `len` field inside the TOC could drive
        // `TocEntry::read_from_file` to consume arbitrary bytes
        // beyond the TOC region — wasting work / memory before the
        // checksum mismatch is caught. The take wrapper turns any
        // such over-read into UnexpectedEof, which surfaces as a
        // structural error.
        let bounded = reader.take(toc_len);
        let mut reader = ChecksummedReader::new(bounded);

        {
            let mut buf = [0u8; TOC_MAGIC.len()];
            reader.read_exact(&mut buf)?;

            if buf != TOC_MAGIC {
                log::error!("Invalid TOC magic header");
                return Err(crate::sfa::Error::InvalidHeader);
            }
        }

        let len = reader.read_u32::<LE>()?;

        // Don't pre-allocate from a length field that has not yet
        // been checksum-verified — a corrupted / forged TOC could
        // force a multi-GiB Vec allocation before the per-entry
        // reads fail. Grow the Vec amortized as entries are read;
        // the per-entry I/O cost dwarfs the realloc cost. The
        // `take(toc_len)` wrapper above bounds the total work even
        // if `len` itself is forged.
        let mut entries = Vec::new();

        for _ in 0..len {
            entries.push(TocEntry::read_from_file(&mut reader)?);
        }

        reader.checksum().check(toc_checksum)?;

        Ok(Toc(entries))
    }
}