Skip to main content

sochdb_storage/
validation.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! SSTable Validation Layer
19//!
20//! Implements defense-in-depth validation for memory-mapped files to prevent crashes
21//! from corrupted, truncated, or tampered files.
22//!
23//! ## Safety Guarantees
24//!
25//! 1. **Pre-mmap validation**: Verify file integrity before memory mapping
26//! 2. **Magic number check**: Ensure file is valid SSTable format
27//! 3. **Size validation**: Prevent reading beyond file boundaries
28//! 4. **Checksum verification**: Detect bit rot and tampering
29//!
30//! ## Formal Safety Invariant
31//!
32//! ∀p ∈ MappedPages: validate_before_mmap(file) = Ok ⟹ p.valid = true
33//!
34//! This establishes that all memory-mapped pages are valid before dereferencing.
35
36use blake3::Hasher;
37use std::fs::File;
38use std::io::{Read, Seek, SeekFrom};
39use std::path::Path;
40use sochdb_core::{Result, SochDBError};
41
/// Minimum valid SSTable size in bytes (header + at least one edge + footer).
///
/// Header (8 bytes magic) + Edge (128 bytes) + Footer (144 bytes) = 280 bytes.
/// The value is derived from [`FOOTER_SIZE`] so the two constants cannot drift apart.
pub const MIN_SSTABLE_SIZE: u64 = 8 + 128 + FOOTER_SIZE as u64;

/// SSTable magic number.
///
/// Read most-significant byte first, the value spells `"AFFV2025"` in ASCII
/// (`0x56` is an uppercase `V`). The validators in this file decode the stored
/// magic with `u64::from_le_bytes`, so the bytes appear in the file in the
/// reverse order.
pub const MAGIC_NUMBER: u64 = 0x4146_4656_3230_3235;

/// Footer size in bytes. The footer occupies the last `FOOTER_SIZE` bytes of the file.
pub const FOOTER_SIZE: usize = 144;
51
/// Reasons an SSTable file can fail validation.
///
/// Each variant carries the data needed to render a diagnostic through its
/// `Display` implementation.
#[derive(Debug)]
pub enum ValidationError {
    /// The file is shorter than the smallest well-formed SSTable.
    TooSmall {
        /// Observed file length in bytes.
        actual: u64,
        /// Smallest acceptable length in bytes.
        minimum: u64,
    },
    /// The magic number read from the file is not the expected one.
    BadMagic {
        /// Magic number the validator was configured with.
        expected: u64,
        /// Magic number found in the file.
        actual: u64,
    },
    /// The 32-byte digest of the file does not match the expected digest.
    ChecksumMismatch {
        /// Digest the caller expected.
        expected: [u8; 32],
        /// Digest computed from the file contents.
        actual: [u8; 32],
    },
    /// An underlying I/O operation failed while validating.
    IoError(std::io::Error),
}
69
70impl std::fmt::Display for ValidationError {
71    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
72        match self {
73            ValidationError::TooSmall { actual, minimum } => {
74                write!(
75                    f,
76                    "SSTable file too small: {} bytes (minimum: {})",
77                    actual, minimum
78                )
79            }
80            ValidationError::BadMagic { expected, actual } => {
81                write!(
82                    f,
83                    "Invalid magic number: {:#x} (expected: {:#x})",
84                    actual, expected
85                )
86            }
87            ValidationError::ChecksumMismatch { expected, actual } => {
88                write!(
89                    f,
90                    "Checksum mismatch: expected {}, got {}",
91                    hex::encode(expected),
92                    hex::encode(actual)
93                )
94            }
95            ValidationError::IoError(e) => write!(f, "I/O error during validation: {}", e),
96        }
97    }
98}
99
100impl std::error::Error for ValidationError {}
101
102impl From<std::io::Error> for ValidationError {
103    fn from(e: std::io::Error) -> Self {
104        ValidationError::IoError(e)
105    }
106}
107
/// SSTable validator for pre-mmap validation.
///
/// Holds the configuration used by the validation routines: which magic number
/// to accept and whether (and against what) to verify a whole-file checksum.
#[derive(Debug, Clone)]
pub struct SSTableValidator {
    /// Expected magic number (default: MAGIC_NUMBER)
    pub expected_magic: u64,

    /// Whether to perform full file checksum (expensive, optional).
    ///
    /// The checksum is only verified when `expected_checksum` is also `Some`.
    pub verify_full_checksum: bool,

    /// Expected file checksum (if known from metadata).
    ///
    /// Ignored unless `verify_full_checksum` is `true`.
    pub expected_checksum: Option<[u8; 32]>,
}
119
120impl Default for SSTableValidator {
121    fn default() -> Self {
122        Self {
123            expected_magic: MAGIC_NUMBER,
124            verify_full_checksum: false,
125            expected_checksum: None,
126        }
127    }
128}
129
130impl SSTableValidator {
131    /// Create validator with full checksum verification enabled
132    pub fn with_checksum_verification(expected_checksum: [u8; 32]) -> Self {
133        Self {
134            expected_magic: MAGIC_NUMBER,
135            verify_full_checksum: true,
136            expected_checksum: Some(expected_checksum),
137        }
138    }
139
140    /// Validate SSTable file before memory mapping
141    ///
142    /// This performs comprehensive validation WITHOUT mmap to establish safety invariants:
143    /// 1. File size >= minimum (header + footer)
144    /// 2. Magic number matches expected value
145    /// 3. Footer is readable and well-formed
146    /// 4. Optional: Full file checksum (if verify_full_checksum = true)
147    ///
148    /// **Performance cost:** ~5-10ms for basic validation, ~50-100ms for full checksum
149    ///
150    /// **Safety benefit:** Prevents segfaults from corrupted/truncated files
151    pub fn validate_before_mmap(&self, file: &mut File) -> Result<()> {
152        // 1. Check file size >= minimum
153        let metadata = file.metadata()?;
154
155        let file_size = metadata.len();
156        if file_size < MIN_SSTABLE_SIZE {
157            return Err(SochDBError::Corruption(format!(
158                "SSTable file too small: {} bytes (minimum: {})",
159                file_size, MIN_SSTABLE_SIZE
160            )));
161        }
162
163        // 2. Read and verify magic number from footer (last bytes)
164        file.seek(SeekFrom::End(-(FOOTER_SIZE as i64)))?;
165
166        let mut footer_bytes = vec![0u8; FOOTER_SIZE];
167        file.read_exact(&mut footer_bytes)?;
168
169        // Extract magic number (first 8 bytes of footer)
170        let magic = u64::from_le_bytes(footer_bytes[0..8].try_into().unwrap());
171        if magic != self.expected_magic {
172            return Err(SochDBError::Corruption(format!(
173                "Invalid SSTable magic number: {:#x} (expected: {:#x})",
174                magic, self.expected_magic
175            )));
176        }
177
178        // 3. Verify footer structure integrity
179        // Extract num_entries (offset 56 in footer)
180        let num_entries = u64::from_le_bytes(footer_bytes[56..64].try_into().unwrap());
181
182        // Sanity check: num_entries should be reasonable
183        // Max entries in one SSTable: ~10M edges (each 128 bytes = 1.28GB file)
184        const MAX_REASONABLE_ENTRIES: u64 = 10_000_000;
185        if num_entries > MAX_REASONABLE_ENTRIES {
186            return Err(SochDBError::Corruption(format!(
187                "Unreasonable num_entries in footer: {} (max: {})",
188                num_entries, MAX_REASONABLE_ENTRIES
189            )));
190        }
191
192        // Verify file size matches expected content
193        // Minimum size check: footer + bloom + index + at least num_entries * 128
194        let min_expected_size = FOOTER_SIZE as u64 + num_entries * 128;
195        if file_size < min_expected_size {
196            return Err(SochDBError::Corruption(format!(
197                "File size {} too small for {} entries (expected >= {})",
198                file_size, num_entries, min_expected_size
199            )));
200        }
201
202        // 4. Optional: Verify full file checksum
203        if self.verify_full_checksum
204            && let Some(expected) = self.expected_checksum
205        {
206            let computed = self.compute_file_checksum(file)?;
207            if computed != expected {
208                return Err(SochDBError::Corruption(format!(
209                    "Checksum mismatch: expected {}, got {}",
210                    hex::encode(expected),
211                    hex::encode(computed)
212                )));
213            }
214        }
215
216        Ok(())
217    }
218
219    /// Compute BLAKE3 checksum of entire file
220    ///
221    /// **Performance:** O(file_size) - reads entire file once
222    /// For 1GB file: ~1 second on modern SSD
223    ///
224    /// **Use case:** One-time validation during SSTable open, or periodic integrity checks
225    fn compute_file_checksum(&self, file: &mut File) -> Result<[u8; 32]> {
226        // Seek to beginning
227        file.seek(SeekFrom::Start(0))?;
228
229        // Read file in chunks and hash
230        let mut hasher = Hasher::new();
231        let mut buffer = vec![0u8; 64 * 1024]; // 64KB chunks
232
233        loop {
234            let bytes_read = file.read(&mut buffer)?;
235
236            if bytes_read == 0 {
237                break;
238            }
239
240            hasher.update(&buffer[..bytes_read]);
241        }
242
243        let hash = hasher.finalize();
244        Ok(*hash.as_bytes())
245    }
246
247    /// Fast validation: only check magic number and file size
248    ///
249    /// **Performance:** O(1) - reads only footer
250    /// **Use case:** Production hot path where performance is critical
251    pub fn validate_fast(&self, file: &mut File) -> Result<()> {
252        // 1. Check file size
253        let metadata = file.metadata()?;
254
255        let file_size = metadata.len();
256        if file_size < MIN_SSTABLE_SIZE {
257            return Err(SochDBError::Corruption(format!(
258                "SSTable file too small: {} bytes",
259                file_size
260            )));
261        }
262
263        // 2. Verify magic number
264        file.seek(SeekFrom::End(-(FOOTER_SIZE as i64)))?;
265
266        let mut magic_bytes = [0u8; 8];
267        file.read_exact(&mut magic_bytes)?;
268
269        let magic = u64::from_le_bytes(magic_bytes);
270        if magic != self.expected_magic {
271            return Err(SochDBError::Corruption(format!(
272                "Invalid magic number: {:#x}",
273                magic
274            )));
275        }
276
277        Ok(())
278    }
279}
280
281/// Validate SSTable file at path (convenience function)
282///
283/// Performs fast validation (magic + size only) unless full_validation is true.
284pub fn validate_sstable_file<P: AsRef<Path>>(path: P, full_validation: bool) -> Result<()> {
285    let mut file = File::open(path.as_ref())?;
286
287    let validator = SSTableValidator::default();
288
289    if full_validation {
290        validator.validate_before_mmap(&mut file)
291    } else {
292        validator.validate_fast(&mut file)
293    }
294}
295
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes `content` to a fresh temporary file and returns its handle.
    ///
    /// The handle is returned so the caller can keep it in scope for as long
    /// as the file's path is being used.
    fn temp_file_with(content: &[u8]) -> NamedTempFile {
        let mut tmp = NamedTempFile::new().unwrap();
        tmp.write_all(content).unwrap();
        tmp.flush().unwrap();
        tmp
    }

    /// Builds a `MIN_SSTABLE_SIZE`-byte, otherwise zero-filled image whose
    /// footer carries `magic` (footer offset 0) and `num_entries` (offset 56).
    fn sstable_image(magic: u64, num_entries: u64) -> Vec<u8> {
        let mut content = vec![0u8; MIN_SSTABLE_SIZE as usize];
        let footer_offset = content.len() - FOOTER_SIZE;
        content[footer_offset..footer_offset + 8].copy_from_slice(&magic.to_le_bytes());
        content[footer_offset + 56..footer_offset + 64]
            .copy_from_slice(&num_entries.to_le_bytes());
        content
    }

    #[test]
    fn test_validate_too_small() {
        // Only 100 bytes: below the minimum size.
        let tmp = temp_file_with(&[0u8; 100]);
        let mut file = File::open(tmp.path()).unwrap();
        let validator = SSTableValidator::default();

        let result = validator.validate_fast(&mut file);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("too small"));
    }

    #[test]
    fn test_validate_bad_magic() {
        // Large enough to pass the size check, but with the wrong magic number.
        let tmp = temp_file_with(&sstable_image(0xDEADBEEF, 0));
        let mut file = File::open(tmp.path()).unwrap();
        let validator = SSTableValidator::default();

        let result = validator.validate_fast(&mut file);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("magic"));
    }

    #[test]
    fn test_validate_correct_file() {
        // Minimal valid SSTable: correct magic and a single entry.
        let tmp = temp_file_with(&sstable_image(MAGIC_NUMBER, 1));
        let mut file = File::open(tmp.path()).unwrap();
        let validator = SSTableValidator::default();

        assert!(validator.validate_fast(&mut file).is_ok());
        // The full validator is the one that reads num_entries, so exercise it too.
        assert!(validator.validate_before_mmap(&mut file).is_ok());
    }

    #[test]
    fn test_validate_entry_count_exceeds_file_size() {
        // Two entries need 144 + 2 * 128 = 400 bytes, but the file has only 280.
        let tmp = temp_file_with(&sstable_image(MAGIC_NUMBER, 2));
        let mut file = File::open(tmp.path()).unwrap();
        let validator = SSTableValidator::default();

        // The fast path does not look at num_entries.
        assert!(validator.validate_fast(&mut file).is_ok());

        let result = validator.validate_before_mmap(&mut file);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("too small for"));
    }
}