Skip to main content

sochdb_storage/
zero_copy_safety.rs

1// SPDX-License-Identifier: AGPL-3.0-or-later
2// SochDB - LLM-Optimized Embedded Database
3// Copyright (C) 2026 Sushanth Reddy Vanagala (https://github.com/sushanthpy)
4//
5// This program is free software: you can redistribute it and/or modify
6// it under the terms of the GNU Affero General Public License as published by
7// the Free Software Foundation, either version 3 of the License, or
8// (at your option) any later version.
9//
10// This program is distributed in the hope that it will be useful,
11// but WITHOUT ANY WARRANTY; without even the implied warranty of
12// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13// GNU Affero General Public License for more details.
14//
15// You should have received a copy of the GNU Affero General Public License
16// along with this program. If not, see <https://www.gnu.org/licenses/>.
17
18//! Zero-Copy Safety with Validation Layer (Task 5)
19//!
20//! Implements defense-in-depth validation for memory-mapped files to prevent crashes
21//! from corrupted, truncated, or tampered files.
22//!
23//! ## Safety Model
24//!
25//! Defense-in-Depth Validation:
26//!
27//! Layer 1: Pre-mmap File Validation
28//!   - Check file size: size ≥ HEADER_SIZE + MIN_ENTRIES × EDGE_SIZE + FOOTER_SIZE
29//!   - Read & verify header magic and version
30//!   - Read & verify footer checksum
31//!   - Sample validation: K = ceiling(ln(1/δ) / ε) random edges
32//!
33//! Layer 2: Bounded Access Wrappers
34//!   - Validated constructors with bounds checking
35//!   - Type-safe API prevents misuse at compile-time
36//!   - All offset dereferences bounds-checked
37//!
38//! Layer 3: Runtime Mmap Protection
39//!   - ValidatedMmap wrapper for all accesses
40//!   - Graceful handling of SIGBUS from truncated files
41//!
42//! ## Probabilistic Sampling
43//!
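//! Instead of O(N) full validation, sample K random edges:
//!   K = ceiling(ln(1/δ) / ε)
//! Where:
//!   δ = false negative rate (e.g., 0.01 = 1% miss rate)
//!   ε = corruption fraction (e.g., 0.01 = 1% bad edges)
//!
//! Why this K: if a fraction ε of the edges is corrupt, K independent samples
//! all miss the corruption with probability (1 − ε)^K ≤ e^(−εK), and
//! e^(−εK) ≤ δ exactly when K ≥ ln(1/δ) / ε.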
49//!
50//! Example: δ = 0.01, ε = 0.01 → K = 461 samples
51//! Time: O(K) = O(log(1/δ) / ε) independent of file size
52
53use std::collections::HashSet;
54use std::fs::File;
55use std::io::{Read, Seek, SeekFrom};
56use std::marker::PhantomData;
57use std::ops::Range;
58use std::path::Path;
59use std::sync::Arc;
60use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
61
62use byteorder::{ByteOrder, LittleEndian};
63
64// ============================================================================
65// Constants
66// ============================================================================
67
/// Magic number expected in the first 8 bytes of an SSTable file header.
///
/// Read most-significant byte first, the value is the ASCII text `TOONDB`
/// (`54 4F 4F 4E 44 42`) followed by `0x0001` (version 1). Note that the
/// embedded text is "TOONDB", not "SOCHDB".
pub const SOCHDB_MAGIC: u64 = 0x544F_4F4E_4442_0001;

/// Magic number expected in the first 4 bytes of every edge record.
pub const EDGE_MAGIC: u32 = 0xED6E_0001;

/// Size of one edge record, in bytes.
pub const EDGE_SIZE: usize = 128;

/// Size of the file header (magic + version + metadata), in bytes.
pub const HEADER_SIZE: usize = 64;

/// Size of the file footer (checksum + stats + index offset), in bytes.
pub const FOOTER_SIZE: usize = 144;

/// Smallest file that validation accepts: header, one edge record, footer.
pub const MIN_FILE_SIZE: u64 = (HEADER_SIZE + EDGE_SIZE + FOOTER_SIZE) as u64;

/// Largest file that validation accepts by default (10 GiB).
pub const MAX_FILE_SIZE: u64 = 10 << 30;

/// Format versions accepted in the file header.
pub const SUPPORTED_VERSIONS: &[u32] = &[1, 2];
91
92// ============================================================================
93// Validation Errors
94// ============================================================================
95
/// Validation error types with detailed context.
///
/// Every variant carries the values that triggered it so callers can log or
/// assert on the exact failure. The type is comparable (`PartialEq`/`Eq`) so
/// tests and callers can match whole error values with `==` / `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationError {
    /// File is smaller than minimum valid size
    FileTooSmall { actual: u64, minimum: u64 },
    /// File is larger than maximum supported size
    FileTooLarge { actual: u64, maximum: u64 },
    /// Invalid magic number at file header
    BadMagic { expected: u64, actual: u64 },
    /// Unsupported format version
    UnsupportedVersion { version: u32, supported: Vec<u32> },
    /// Footer checksum does not match
    ChecksumMismatch {
        /// Checksum stored in the file footer
        expected: [u8; 32],
        /// Checksum recomputed from the file contents
        actual: [u8; 32],
    },
    /// Edge at given index is corrupted
    CorruptedEdge { index: usize, reason: String },
    /// Offset points outside valid data region
    InvalidOffset { offset: u64, max: u64 },
    /// Length would exceed data region
    InvalidLength { offset: u64, length: u64, max: u64 },
    /// Alignment violation
    AlignmentViolation {
        offset: u64,
        required_alignment: usize,
    },
    /// Access to unmapped or invalid region
    OutOfBounds {
        offset: usize,
        length: usize,
        region_size: usize,
    },
    /// I/O error during validation (only the error's text is retained)
    IoError(String),
    /// File was truncated after mmap
    TruncatedFile { expected: u64, actual: u64 },
}
134
135impl std::fmt::Display for ValidationError {
136    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
137        match self {
138            Self::FileTooSmall { actual, minimum } => {
139                write!(f, "File too small: {} bytes (minimum: {})", actual, minimum)
140            }
141            Self::FileTooLarge { actual, maximum } => {
142                write!(f, "File too large: {} bytes (maximum: {})", actual, maximum)
143            }
144            Self::BadMagic { expected, actual } => {
145                write!(f, "Bad magic: {:#x} (expected {:#x})", actual, expected)
146            }
147            Self::UnsupportedVersion { version, supported } => {
148                write!(
149                    f,
150                    "Unsupported version: {} (supported: {:?})",
151                    version, supported
152                )
153            }
154            Self::ChecksumMismatch { expected, actual } => {
155                write!(
156                    f,
157                    "Checksum mismatch: {} vs {}",
158                    hex::encode(expected),
159                    hex::encode(actual)
160                )
161            }
162            Self::CorruptedEdge { index, reason } => {
163                write!(f, "Corrupted edge at index {}: {}", index, reason)
164            }
165            Self::InvalidOffset { offset, max } => {
166                write!(f, "Invalid offset: {} (max: {})", offset, max)
167            }
168            Self::InvalidLength {
169                offset,
170                length,
171                max,
172            } => {
173                write!(
174                    f,
175                    "Invalid length: {} at offset {} (max: {})",
176                    length, offset, max
177                )
178            }
179            Self::AlignmentViolation {
180                offset,
181                required_alignment,
182            } => {
183                write!(
184                    f,
185                    "Alignment violation at {}: required {} byte alignment",
186                    offset, required_alignment
187                )
188            }
189            Self::OutOfBounds {
190                offset,
191                length,
192                region_size,
193            } => {
194                write!(
195                    f,
196                    "Out of bounds: [{}..{}] in region of size {}",
197                    offset,
198                    offset + length,
199                    region_size
200                )
201            }
202            Self::IoError(e) => write!(f, "I/O error: {}", e),
203            Self::TruncatedFile { expected, actual } => {
204                write!(
205                    f,
206                    "File truncated: expected {} bytes, got {}",
207                    expected, actual
208                )
209            }
210        }
211    }
212}
213
214impl std::error::Error for ValidationError {}
215
216impl From<std::io::Error> for ValidationError {
217    fn from(e: std::io::Error) -> Self {
218        ValidationError::IoError(e.to_string())
219    }
220}
221
222// ============================================================================
223// Validation Metrics
224// ============================================================================
225
/// Counters describing validation activity.
///
/// Every field is an atomic counter that is only ever incremented (with
/// relaxed ordering) through `&self`, so one instance can be updated from
/// several places without additional locking.
#[derive(Debug, Default)]
pub struct ValidationMetrics {
    /// Number of files that went through validation (successful or not)
    pub files_validated: AtomicU64,
    /// Number of validations that ended in an error
    pub validation_failures: AtomicU64,
    /// Number of edge records inspected by sampling
    pub edges_sampled: AtomicU64,
    /// Number of sampled edge records found to be corrupted
    pub corrupted_edges_detected: AtomicU64,
    /// Number of rejected out-of-bounds accesses
    pub bounds_violations: AtomicU64,
    /// Accumulated validation time, in microseconds
    pub validation_time_us: AtomicU64,
}

impl ValidationMetrics {
    /// Creates a metrics set with every counter at zero.
    pub fn new() -> Self {
        Default::default()
    }

    /// Adds `amount` to `counter` using relaxed ordering.
    fn add(counter: &AtomicU64, amount: u64) {
        counter.fetch_add(amount, Ordering::Relaxed);
    }

    /// Records one finished file validation.
    ///
    /// Always counts the file and adds `duration_us` to the running total;
    /// additionally counts a failure when `success` is `false`.
    pub fn record_validation(&self, success: bool, duration_us: u64) {
        Self::add(&self.files_validated, 1);
        Self::add(&self.validation_time_us, duration_us);
        if success {
            return;
        }
        Self::add(&self.validation_failures, 1);
    }

    /// Records one sampled edge, and a detected corruption when `corrupted`
    /// is `true`.
    pub fn record_sample(&self, corrupted: bool) {
        Self::add(&self.edges_sampled, 1);
        if corrupted {
            Self::add(&self.corrupted_edges_detected, 1);
        }
    }

    /// Records one rejected out-of-bounds access.
    pub fn record_bounds_violation(&self) {
        Self::add(&self.bounds_violations, 1);
    }
}
269
270// ============================================================================
271// Layer 1: Pre-Mmap File Validation
272// ============================================================================
273
274/// Configuration for pre-mmap validation
275#[derive(Debug, Clone)]
276pub struct ValidationConfig {
277    /// Perform full file checksum validation
278    pub full_checksum: bool,
279    /// Number of random edge samples (0 = no sampling)
280    pub sample_count: usize,
281    /// Maximum acceptable file size
282    pub max_file_size: u64,
283    /// Check alignment constraints
284    pub check_alignment: bool,
285    /// Required alignment for edge data
286    pub required_alignment: usize,
287}
288
289impl Default for ValidationConfig {
290    fn default() -> Self {
291        Self {
292            full_checksum: false,
293            sample_count: 100, // Default: sample 100 edges
294            max_file_size: MAX_FILE_SIZE,
295            check_alignment: true,
296            required_alignment: 8,
297        }
298    }
299}
300
301impl ValidationConfig {
302    /// High-security config with full validation
303    pub fn high_security() -> Self {
304        Self {
305            full_checksum: true,
306            sample_count: 500,
307            max_file_size: MAX_FILE_SIZE,
308            check_alignment: true,
309            required_alignment: 8,
310        }
311    }
312
313    /// Fast validation for hot path
314    pub fn fast() -> Self {
315        Self {
316            full_checksum: false,
317            sample_count: 0,
318            max_file_size: MAX_FILE_SIZE,
319            check_alignment: false,
320            required_alignment: 1,
321        }
322    }
323
324    /// Calculate optimal sample count for given parameters
325    ///
326    /// K = ceiling(ln(1/δ) / ε)
327    ///
328    /// - delta: false negative rate (probability of missing corruption)
329    /// - epsilon: minimum corruption fraction to detect
330    pub fn optimal_sample_count(delta: f64, epsilon: f64) -> usize {
331        ((1.0 / delta).ln() / epsilon).ceil() as usize
332    }
333}
334
335/// Pre-mmap file validator
336pub struct FileValidator {
337    config: ValidationConfig,
338    metrics: Arc<ValidationMetrics>,
339}
340
341impl FileValidator {
342    pub fn new(config: ValidationConfig) -> Self {
343        Self {
344            config,
345            metrics: Arc::new(ValidationMetrics::new()),
346        }
347    }
348
349    pub fn with_metrics(config: ValidationConfig, metrics: Arc<ValidationMetrics>) -> Self {
350        Self { config, metrics }
351    }
352
353    pub fn metrics(&self) -> &Arc<ValidationMetrics> {
354        &self.metrics
355    }
356
357    /// Validate file before memory mapping
358    ///
359    /// Steps:
360    /// 1. Check file size constraints
361    /// 2. Read and verify header magic/version
362    /// 3. Read and verify footer checksum (optional)
363    /// 4. Sample random edges for corruption (optional)
364    pub fn validate_before_mmap(
365        &self,
366        path: &Path,
367    ) -> std::result::Result<FileMetadata, ValidationError> {
368        let start = std::time::Instant::now();
369
370        let result = self.validate_impl(path);
371
372        let duration_us = start.elapsed().as_micros() as u64;
373        self.metrics.record_validation(result.is_ok(), duration_us);
374
375        result
376    }
377
378    fn validate_impl(&self, path: &Path) -> std::result::Result<FileMetadata, ValidationError> {
379        let mut file = File::open(path)?;
380        let file_size = file.metadata()?.len();
381
382        // Step 1: Size constraints
383        if file_size < MIN_FILE_SIZE {
384            return Err(ValidationError::FileTooSmall {
385                actual: file_size,
386                minimum: MIN_FILE_SIZE,
387            });
388        }
389
390        if file_size > self.config.max_file_size {
391            return Err(ValidationError::FileTooLarge {
392                actual: file_size,
393                maximum: self.config.max_file_size,
394            });
395        }
396
397        // Step 2: Read and verify header
398        file.seek(SeekFrom::Start(0))?;
399        let mut header = [0u8; HEADER_SIZE];
400        file.read_exact(&mut header)?;
401
402        let magic = LittleEndian::read_u64(&header[0..8]);
403        if magic != SOCHDB_MAGIC {
404            return Err(ValidationError::BadMagic {
405                expected: SOCHDB_MAGIC,
406                actual: magic,
407            });
408        }
409
410        let version = LittleEndian::read_u32(&header[8..12]);
411        if !SUPPORTED_VERSIONS.contains(&version) {
412            return Err(ValidationError::UnsupportedVersion {
413                version,
414                supported: SUPPORTED_VERSIONS.to_vec(),
415            });
416        }
417
418        let num_edges = LittleEndian::read_u64(&header[16..24]);
419        let data_offset = HEADER_SIZE as u64;
420        let data_length = num_edges * EDGE_SIZE as u64;
421
422        // Step 3: Read and verify footer
423        file.seek(SeekFrom::End(-(FOOTER_SIZE as i64)))?;
424        let mut footer = [0u8; FOOTER_SIZE];
425        file.read_exact(&mut footer)?;
426
427        // Optional: Full checksum verification
428        if self.config.full_checksum {
429            let expected_checksum: [u8; 32] = footer[0..32].try_into().unwrap();
430            let actual_checksum =
431                self.compute_checksum(&mut file, file_size - FOOTER_SIZE as u64)?;
432
433            if expected_checksum != actual_checksum {
434                return Err(ValidationError::ChecksumMismatch {
435                    expected: expected_checksum,
436                    actual: actual_checksum,
437                });
438            }
439        }
440
441        // Step 4: Sample random edges for corruption detection
442        if self.config.sample_count > 0 && num_edges > 0 {
443            self.validate_edge_samples(&mut file, data_offset, num_edges)?;
444        }
445
446        Ok(FileMetadata {
447            file_size,
448            version,
449            num_edges,
450            data_offset,
451            data_length,
452        })
453    }
454
455    fn compute_checksum(
456        &self,
457        file: &mut File,
458        length: u64,
459    ) -> std::result::Result<[u8; 32], ValidationError> {
460        file.seek(SeekFrom::Start(0))?;
461
462        let mut hasher = blake3::Hasher::new();
463        let mut buffer = vec![0u8; 64 * 1024];
464        let mut remaining = length;
465
466        while remaining > 0 {
467            let to_read = remaining.min(buffer.len() as u64) as usize;
468            file.read_exact(&mut buffer[..to_read])?;
469            hasher.update(&buffer[..to_read]);
470            remaining -= to_read as u64;
471        }
472
473        Ok(*hasher.finalize().as_bytes())
474    }
475
476    fn validate_edge_samples(
477        &self,
478        file: &mut File,
479        data_offset: u64,
480        num_edges: u64,
481    ) -> std::result::Result<(), ValidationError> {
482        // Use a simple deterministic pseudo-random sampling based on hashing
483        // This avoids adding rand dependency while still providing good coverage
484        let sample_count = self.config.sample_count.min(num_edges as usize);
485        let mut sampled_indices = HashSet::new();
486
487        // Generate sample indices using a simple hash-based PRNG
488        let mut seed = 0x12345678u64;
489        let prime = 0x9E3779B97F4A7C15u64; // Golden ratio based prime
490
491        while sampled_indices.len() < sample_count {
492            seed = seed.wrapping_mul(prime).wrapping_add(1);
493            let idx = (seed % num_edges) as usize;
494            sampled_indices.insert(idx);
495        }
496
497        let mut edge_buffer = [0u8; EDGE_SIZE];
498
499        for idx in sampled_indices {
500            let edge_offset = data_offset + (idx as u64 * EDGE_SIZE as u64);
501            file.seek(SeekFrom::Start(edge_offset))?;
502            file.read_exact(&mut edge_buffer)?;
503
504            let corrupted = !self.validate_edge(&edge_buffer, idx);
505            self.metrics.record_sample(corrupted);
506
507            if corrupted {
508                return Err(ValidationError::CorruptedEdge {
509                    index: idx,
510                    reason: "Edge validation failed".to_string(),
511                });
512            }
513        }
514
515        Ok(())
516    }
517
518    fn validate_edge(&self, edge_bytes: &[u8; EDGE_SIZE], _index: usize) -> bool {
519        // Check edge magic number
520        let edge_magic = LittleEndian::read_u32(&edge_bytes[0..4]);
521        if edge_magic != EDGE_MAGIC {
522            return false;
523        }
524
525        // Check edge CRC (last 4 bytes)
526        let expected_crc = LittleEndian::read_u32(&edge_bytes[EDGE_SIZE - 4..]);
527        let actual_crc = crc32fast::hash(&edge_bytes[..EDGE_SIZE - 4]);
528
529        expected_crc == actual_crc
530    }
531}
532
/// Facts about a file gathered while validating it.
#[derive(Debug, Clone)]
pub struct FileMetadata {
    /// Size of the file in bytes, as reported by the filesystem
    pub file_size: u64,
    /// Format version read from the header
    pub version: u32,
    /// Number of edge records declared in the header
    pub num_edges: u64,
    /// Byte offset at which edge records begin
    pub data_offset: u64,
    /// Total length of the edge records in bytes
    pub data_length: u64,
}
542
543// ============================================================================
544// Layer 2: Bounded Access Wrappers
545// ============================================================================
546
547/// Type-safe, bounds-checked edge reference
548///
549/// Instead of raw transmute, uses validated constructor with bounds check.
550pub struct EdgeRef<'a> {
551    bytes: &'a [u8; EDGE_SIZE],
552    _marker: PhantomData<&'a ()>,
553}
554
555impl<'a> EdgeRef<'a> {
556    /// Create a new EdgeRef with bounds and validity checking
557    ///
558    /// Returns Err if:
559    /// - offset + EDGE_SIZE > data.len()
560    /// - Edge magic is invalid
561    /// - Edge CRC does not match
562    pub fn new_checked(
563        data: &'a [u8],
564        offset: usize,
565    ) -> std::result::Result<Self, ValidationError> {
566        // Bounds check
567        if offset + EDGE_SIZE > data.len() {
568            return Err(ValidationError::OutOfBounds {
569                offset,
570                length: EDGE_SIZE,
571                region_size: data.len(),
572            });
573        }
574
575        let slice = &data[offset..offset + EDGE_SIZE];
576        let bytes: &[u8; EDGE_SIZE] =
577            slice
578                .try_into()
579                .map_err(|_| ValidationError::InvalidLength {
580                    offset: offset as u64,
581                    length: EDGE_SIZE as u64,
582                    max: data.len() as u64,
583                })?;
584
585        // Verify edge magic
586        let magic = LittleEndian::read_u32(&bytes[0..4]);
587        if magic != EDGE_MAGIC {
588            return Err(ValidationError::CorruptedEdge {
589                index: offset / EDGE_SIZE,
590                reason: format!("Bad edge magic: {:#x}", magic),
591            });
592        }
593
594        Ok(Self {
595            bytes,
596            _marker: PhantomData,
597        })
598    }
599
600    /// Create EdgeRef without validation (unsafe fast path)
601    ///
602    /// # Safety
603    /// Caller must ensure:
604    /// - offset + EDGE_SIZE <= data.len()
605    /// - Edge data is valid
606    pub unsafe fn new_unchecked(data: &'a [u8], offset: usize) -> Self {
607        let bytes: &[u8; EDGE_SIZE] = unsafe {
608            data[offset..offset + EDGE_SIZE]
609                .try_into()
610                .unwrap_unchecked()
611        };
612        Self {
613            bytes,
614            _marker: PhantomData,
615        }
616    }
617
618    /// Get raw bytes
619    pub fn as_bytes(&self) -> &[u8; EDGE_SIZE] {
620        self.bytes
621    }
622
623    /// Get source vertex ID (with bounds check)
624    pub fn source_id(&self) -> u64 {
625        LittleEndian::read_u64(&self.bytes[4..12])
626    }
627
628    /// Get target vertex ID (with bounds check)
629    pub fn target_id(&self) -> u64 {
630        LittleEndian::read_u64(&self.bytes[12..20])
631    }
632
633    /// Get edge weight (with bounds check)
634    pub fn weight(&self) -> f64 {
635        LittleEndian::read_f64(&self.bytes[20..28])
636    }
637
638    /// Get edge type (with bounds check)
639    pub fn edge_type(&self) -> u32 {
640        LittleEndian::read_u32(&self.bytes[28..32])
641    }
642
643    /// Get timestamp (with bounds check)
644    pub fn timestamp(&self) -> u64 {
645        LittleEndian::read_u64(&self.bytes[32..40])
646    }
647
648    /// Get payload bytes with bounds validation
649    ///
650    /// Payload is stored at variable offset within the edge
651    pub fn payload_bytes(&self) -> std::result::Result<&'a [u8], ValidationError> {
652        let payload_offset = LittleEndian::read_u32(&self.bytes[40..44]) as usize;
653        let payload_length = LittleEndian::read_u32(&self.bytes[44..48]) as usize;
654
655        // Bounds check within edge
656        if payload_offset + payload_length > EDGE_SIZE - 4 {
657            // -4 for CRC
658            return Err(ValidationError::InvalidOffset {
659                offset: payload_offset as u64,
660                max: (EDGE_SIZE - 4) as u64,
661            });
662        }
663
664        Ok(&self.bytes[payload_offset..payload_offset + payload_length])
665    }
666
667    /// Verify edge CRC
668    pub fn verify_crc(&self) -> bool {
669        let expected_crc = LittleEndian::read_u32(&self.bytes[EDGE_SIZE - 4..]);
670        let actual_crc = crc32fast::hash(&self.bytes[..EDGE_SIZE - 4]);
671        expected_crc == actual_crc
672    }
673}
674
675// ============================================================================
676// Layer 3: ValidatedMmap Wrapper
677// ============================================================================
678
679/// Validated memory-mapped region with bounds checking
680///
681/// All accesses are bounds-checked to prevent undefined behavior
682/// from corrupted or truncated files.
683pub struct ValidatedMmap {
684    /// Underlying mmap (via memmap2 or similar)
685    data: Vec<u8>, // Using Vec for safety; in production use memmap2::Mmap
686    /// File metadata from validation
687    metadata: FileMetadata,
688    /// Whether file has been truncated
689    is_valid: AtomicBool,
690    /// Access metrics
691    metrics: Arc<ValidationMetrics>,
692}
693
694impl ValidatedMmap {
695    /// Create a new ValidatedMmap with full validation
696    pub fn open(
697        path: &Path,
698        config: ValidationConfig,
699    ) -> std::result::Result<Self, ValidationError> {
700        let validator = FileValidator::new(config);
701        let metadata = validator.validate_before_mmap(path)?;
702
703        // Read entire file (in production, use mmap)
704        let mut file = File::open(path)?;
705        let mut data = Vec::with_capacity(metadata.file_size as usize);
706        file.read_to_end(&mut data)?;
707
708        Ok(Self {
709            data,
710            metadata,
711            is_valid: AtomicBool::new(true),
712            metrics: validator.metrics,
713        })
714    }
715
716    /// Get file metadata
717    pub fn metadata(&self) -> &FileMetadata {
718        &self.metadata
719    }
720
721    /// Check if mmap is still valid
722    pub fn is_valid(&self) -> bool {
723        self.is_valid.load(Ordering::Acquire)
724    }
725
726    /// Get a validated edge reference
727    pub fn get_edge(&self, index: usize) -> std::result::Result<EdgeRef<'_>, ValidationError> {
728        if !self.is_valid() {
729            return Err(ValidationError::TruncatedFile {
730                expected: self.metadata.file_size,
731                actual: self.data.len() as u64,
732            });
733        }
734
735        if index >= self.metadata.num_edges as usize {
736            self.metrics.record_bounds_violation();
737            return Err(ValidationError::OutOfBounds {
738                offset: index * EDGE_SIZE + self.metadata.data_offset as usize,
739                length: EDGE_SIZE,
740                region_size: self.data.len(),
741            });
742        }
743
744        let offset = self.metadata.data_offset as usize + index * EDGE_SIZE;
745        EdgeRef::new_checked(&self.data, offset)
746    }
747
748    /// Get a slice of the data with bounds checking
749    pub fn slice(&self, range: Range<usize>) -> std::result::Result<&[u8], ValidationError> {
750        if !self.is_valid() {
751            return Err(ValidationError::TruncatedFile {
752                expected: self.metadata.file_size,
753                actual: self.data.len() as u64,
754            });
755        }
756
757        if range.end > self.data.len() {
758            self.metrics.record_bounds_violation();
759            return Err(ValidationError::OutOfBounds {
760                offset: range.start,
761                length: range.end - range.start,
762                region_size: self.data.len(),
763            });
764        }
765
766        Ok(&self.data[range])
767    }
768
769    /// Iterate over all edges with validation
770    pub fn iter_edges(&self) -> ValidatedEdgeIterator<'_> {
771        ValidatedEdgeIterator {
772            mmap: self,
773            current_index: 0,
774        }
775    }
776
777    /// Get number of edges
778    pub fn num_edges(&self) -> usize {
779        self.metadata.num_edges as usize
780    }
781
782    /// Mark mmap as invalid (e.g., after detecting truncation)
783    pub fn invalidate(&self) {
784        self.is_valid.store(false, Ordering::Release);
785    }
786
787    /// Verify integrity of all edges
788    pub fn verify_all(&self) -> std::result::Result<usize, ValidationError> {
789        let mut valid_count = 0;
790        for i in 0..self.metadata.num_edges as usize {
791            let edge = self.get_edge(i)?;
792            if edge.verify_crc() {
793                valid_count += 1;
794            }
795        }
796        Ok(valid_count)
797    }
798}
799
800/// Iterator over validated edges
801pub struct ValidatedEdgeIterator<'a> {
802    mmap: &'a ValidatedMmap,
803    current_index: usize,
804}
805
806impl<'a> Iterator for ValidatedEdgeIterator<'a> {
807    type Item = std::result::Result<EdgeRef<'a>, ValidationError>;
808
809    fn next(&mut self) -> Option<Self::Item> {
810        if self.current_index >= self.mmap.num_edges() {
811            return None;
812        }
813
814        let result = self.mmap.get_edge(self.current_index);
815        self.current_index += 1;
816        Some(result)
817    }
818
819    fn size_hint(&self) -> (usize, Option<usize>) {
820        let remaining = self.mmap.num_edges() - self.current_index;
821        (remaining, Some(remaining))
822    }
823}
824
825impl<'a> ExactSizeIterator for ValidatedEdgeIterator<'a> {}
826
827// ============================================================================
828// Offset Validation Helpers
829// ============================================================================
830
831/// Validates that an offset and length are within bounds
832#[inline]
833pub fn validate_offset_length(
834    offset: u64,
835    length: u64,
836    max: u64,
837) -> std::result::Result<(), ValidationError> {
838    if offset > max {
839        return Err(ValidationError::InvalidOffset { offset, max });
840    }
841    if offset + length > max {
842        return Err(ValidationError::InvalidLength {
843            offset,
844            length,
845            max,
846        });
847    }
848    Ok(())
849}
850
851/// Validates alignment of an offset
852#[inline]
853pub fn validate_alignment(
854    offset: u64,
855    alignment: usize,
856) -> std::result::Result<(), ValidationError> {
857    if !(offset as usize).is_multiple_of(alignment) {
858        return Err(ValidationError::AlignmentViolation {
859            offset,
860            required_alignment: alignment,
861        });
862    }
863    Ok(())
864}
865
866// ============================================================================
867// Tests
868// ============================================================================
869
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Writes `bytes` to a fresh temporary file and flushes it.
    fn temp_file_with(bytes: &[u8]) -> NamedTempFile {
        let mut file = NamedTempFile::new().unwrap();
        file.write_all(bytes).unwrap();
        file.flush().unwrap();
        file
    }

    /// Builds a well-formed file: version-1 header declaring two edges, two
    /// edges with correct magic and CRC, and an all-zero footer.
    fn create_valid_test_file() -> NamedTempFile {
        const EDGE_COUNT: u64 = 2;
        let mut image = Vec::new();

        // Header
        let mut header = [0u8; HEADER_SIZE];
        LittleEndian::write_u64(&mut header[0..8], SOCHDB_MAGIC);
        LittleEndian::write_u32(&mut header[8..12], 1); // version
        LittleEndian::write_u64(&mut header[16..24], EDGE_COUNT); // num_edges
        image.extend_from_slice(&header);

        // Edges: source = i, target = i + 1, CRC over everything but the
        // last four bytes.
        for source in 0..EDGE_COUNT {
            let mut edge = [0u8; EDGE_SIZE];
            LittleEndian::write_u32(&mut edge[0..4], EDGE_MAGIC);
            LittleEndian::write_u64(&mut edge[4..12], source);
            LittleEndian::write_u64(&mut edge[12..20], source + 1);

            let crc = crc32fast::hash(&edge[..EDGE_SIZE - 4]);
            LittleEndian::write_u32(&mut edge[EDGE_SIZE - 4..], crc);

            image.extend_from_slice(&edge);
        }

        // Footer
        image.extend_from_slice(&[0u8; FOOTER_SIZE]);

        temp_file_with(&image)
    }

    /// Opens the valid fixture with the fast config; the temp file is
    /// returned too so it outlives the open call.
    fn open_valid_fixture() -> (NamedTempFile, ValidatedMmap) {
        let file = create_valid_test_file();
        let mmap = ValidatedMmap::open(file.path(), ValidationConfig::fast()).unwrap();
        (file, mmap)
    }

    #[test]
    fn test_file_too_small() {
        let file = temp_file_with(&[0u8; 100]);

        let outcome =
            FileValidator::new(ValidationConfig::default()).validate_before_mmap(file.path());

        assert!(matches!(outcome, Err(ValidationError::FileTooSmall { .. })));
    }

    #[test]
    fn test_bad_magic() {
        // A minimum-size, otherwise zeroed file whose first 8 bytes hold the
        // wrong magic.
        let mut image = vec![0u8; MIN_FILE_SIZE as usize];
        LittleEndian::write_u64(&mut image[0..8], 0xDEADBEEF);
        let file = temp_file_with(&image);

        let outcome =
            FileValidator::new(ValidationConfig::fast()).validate_before_mmap(file.path());

        assert!(matches!(outcome, Err(ValidationError::BadMagic { .. })));
    }

    #[test]
    fn test_valid_file() {
        let file = create_valid_test_file();

        let outcome =
            FileValidator::new(ValidationConfig::fast()).validate_before_mmap(file.path());

        assert!(outcome.is_ok());
        let metadata = outcome.unwrap();
        assert_eq!(metadata.version, 1);
        assert_eq!(metadata.num_edges, 2);
    }

    #[test]
    fn test_edge_ref_bounds_check() {
        let (_file, mmap) = open_valid_fixture();

        // Valid access
        assert!(mmap.get_edge(0).is_ok());

        // Out of bounds
        assert!(matches!(
            mmap.get_edge(100),
            Err(ValidationError::OutOfBounds { .. })
        ));
    }

    #[test]
    fn test_edge_ref_crc_verification() {
        let (_file, mmap) = open_valid_fixture();

        let first = mmap.get_edge(0).unwrap();
        assert!(first.verify_crc());
    }

    #[test]
    fn test_validated_iterator() {
        let (_file, mmap) = open_valid_fixture();

        let collected: Vec<_> = mmap.iter_edges().collect();
        assert_eq!(collected.len(), 2);
        assert!(collected.iter().all(|edge| edge.is_ok()));
    }

    #[test]
    fn test_optimal_sample_count() {
        // K = ceiling(ln(1/δ) / ε)
        // For δ = 0.01, ε = 0.01: K = ceiling(ln(100) / 0.01) ≈ 461
        let samples = ValidationConfig::optimal_sample_count(0.01, 0.01);
        assert!((460..=470).contains(&samples));
    }
}