sochdb_storage/
zero_copy_safety.rs

1// Copyright 2025 Sushanth (https://github.com/sushanthpy)
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15//! Zero-Copy Safety with Validation Layer (Task 5)
16//!
17//! Implements defense-in-depth validation for memory-mapped files to prevent crashes
18//! from corrupted, truncated, or tampered files.
19//!
20//! ## Safety Model
21//!
22//! Defense-in-Depth Validation:
23//!
24//! Layer 1: Pre-mmap File Validation
25//!   - Check file size: size ≥ HEADER_SIZE + MIN_ENTRIES × EDGE_SIZE + FOOTER_SIZE
26//!   - Read & verify header magic and version
27//!   - Read & verify footer checksum
28//!   - Sample validation: K = ceiling(ln(1/δ) / ε) random edges
29//!
30//! Layer 2: Bounded Access Wrappers
31//!   - Validated constructors with bounds checking
32//!   - Type-safe API prevents misuse at compile-time
33//!   - All offset dereferences bounds-checked
34//!
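//! Layer 3: Runtime Mmap Protection
//!   - ValidatedMmap wrapper for all accesses
//!   - Design goal: graceful handling of SIGBUS from truncated files. The current
//!     implementation reads the file into a `Vec` instead of mapping it and reports
//!     truncation as `ValidationError::TruncatedFile` once the wrapper is invalidated.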
38//!
39//! ## Probabilistic Sampling
40//!
41//! Instead of O(N) full validation, sample K random edges:
42//!   K = ceiling(ln(1/δ) / ε)
43//! Where:
44//!   δ = false negative rate (e.g., 0.01 = 1% miss rate)
45//!   ε = corruption fraction (e.g., 0.01 = 1% bad edges)
46//!
47//! Example: δ = 0.01, ε = 0.01 → K = 461 samples
48//! Time: O(K) = O(log(1/δ) / ε) independent of file size
49
50use std::collections::HashSet;
51use std::fs::File;
52use std::io::{Read, Seek, SeekFrom};
53use std::marker::PhantomData;
54use std::ops::Range;
55use std::path::Path;
56use std::sync::Arc;
57use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
58
59use byteorder::{ByteOrder, LittleEndian};
60
61// ============================================================================
62// Constants
63// ============================================================================
64
/// SochDB magic number for SSTable files.
///
/// Stored little-endian in header bytes `0..8`.
pub const SOCHDB_MAGIC: u64 = 0x544F4F4E44420001; // bytes 54 4F 4F 4E 44 42 are ASCII "TOONDB", followed by 0x0001 (version 1)

/// Edge magic number (within each edge record).
///
/// Stored little-endian in bytes `0..4` of every edge.
pub const EDGE_MAGIC: u32 = 0xED6E0001;

/// Standard edge size in bytes. The last 4 bytes of each edge hold its CRC32.
pub const EDGE_SIZE: usize = 128;

/// Header size (magic + version + metadata).
///
/// The validator reads the magic from bytes `0..8`, the version from `8..12`
/// and the edge count from `16..24`.
pub const HEADER_SIZE: usize = 64;

/// Footer size (checksum + stats + index offset).
///
/// The first 32 bytes are the checksum compared during full validation.
pub const FOOTER_SIZE: usize = 144;

/// Minimum valid file size: header + one edge + footer (336 bytes).
pub const MIN_FILE_SIZE: u64 = (HEADER_SIZE + EDGE_SIZE + FOOTER_SIZE) as u64;

/// Maximum reasonable file size (10 GiB).
pub const MAX_FILE_SIZE: u64 = 10 * 1024 * 1024 * 1024;

/// Supported format versions (header bytes `8..12`).
pub const SUPPORTED_VERSIONS: &[u32] = &[1, 2];
88
89// ============================================================================
90// Validation Errors
91// ============================================================================
92
/// Validation error types with detailed context.
///
/// Every variant carries the values that triggered it so callers can log or
/// react without re-deriving them. The type is comparable by value
/// (`PartialEq`/`Eq`) so errors can be asserted on directly.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum ValidationError {
    /// File is smaller than minimum valid size
    FileTooSmall { actual: u64, minimum: u64 },
    /// File is larger than maximum supported size
    FileTooLarge { actual: u64, maximum: u64 },
    /// Invalid magic number at file header
    BadMagic { expected: u64, actual: u64 },
    /// Unsupported format version
    UnsupportedVersion { version: u32, supported: Vec<u32> },
    /// Footer checksum does not match
    ChecksumMismatch {
        /// Checksum stored in the footer
        expected: [u8; 32],
        /// Checksum computed from the file contents
        actual: [u8; 32],
    },
    /// Edge at given index is corrupted
    CorruptedEdge { index: usize, reason: String },
    /// Offset points outside valid data region
    InvalidOffset { offset: u64, max: u64 },
    /// Length would exceed data region
    InvalidLength { offset: u64, length: u64, max: u64 },
    /// Alignment violation
    AlignmentViolation {
        offset: u64,
        required_alignment: usize,
    },
    /// Access to unmapped or invalid region
    OutOfBounds {
        offset: usize,
        length: usize,
        region_size: usize,
    },
    /// I/O error during validation (carries the error's display text)
    IoError(String),
    /// File was truncated after mmap
    TruncatedFile { expected: u64, actual: u64 },
}
131
132impl std::fmt::Display for ValidationError {
133    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
134        match self {
135            Self::FileTooSmall { actual, minimum } => {
136                write!(f, "File too small: {} bytes (minimum: {})", actual, minimum)
137            }
138            Self::FileTooLarge { actual, maximum } => {
139                write!(f, "File too large: {} bytes (maximum: {})", actual, maximum)
140            }
141            Self::BadMagic { expected, actual } => {
142                write!(f, "Bad magic: {:#x} (expected {:#x})", actual, expected)
143            }
144            Self::UnsupportedVersion { version, supported } => {
145                write!(
146                    f,
147                    "Unsupported version: {} (supported: {:?})",
148                    version, supported
149                )
150            }
151            Self::ChecksumMismatch { expected, actual } => {
152                write!(
153                    f,
154                    "Checksum mismatch: {} vs {}",
155                    hex::encode(expected),
156                    hex::encode(actual)
157                )
158            }
159            Self::CorruptedEdge { index, reason } => {
160                write!(f, "Corrupted edge at index {}: {}", index, reason)
161            }
162            Self::InvalidOffset { offset, max } => {
163                write!(f, "Invalid offset: {} (max: {})", offset, max)
164            }
165            Self::InvalidLength {
166                offset,
167                length,
168                max,
169            } => {
170                write!(
171                    f,
172                    "Invalid length: {} at offset {} (max: {})",
173                    length, offset, max
174                )
175            }
176            Self::AlignmentViolation {
177                offset,
178                required_alignment,
179            } => {
180                write!(
181                    f,
182                    "Alignment violation at {}: required {} byte alignment",
183                    offset, required_alignment
184                )
185            }
186            Self::OutOfBounds {
187                offset,
188                length,
189                region_size,
190            } => {
191                write!(
192                    f,
193                    "Out of bounds: [{}..{}] in region of size {}",
194                    offset,
195                    offset + length,
196                    region_size
197                )
198            }
199            Self::IoError(e) => write!(f, "I/O error: {}", e),
200            Self::TruncatedFile { expected, actual } => {
201                write!(
202                    f,
203                    "File truncated: expected {} bytes, got {}",
204                    expected, actual
205                )
206            }
207        }
208    }
209}
210
// Marker impl: relies on the trait's default methods, so no `source()` chain
// is exposed (I/O errors are stored as text in `IoError`).
impl std::error::Error for ValidationError {}
212
213impl From<std::io::Error> for ValidationError {
214    fn from(e: std::io::Error) -> Self {
215        ValidationError::IoError(e.to_string())
216    }
217}
218
219// ============================================================================
220// Validation Metrics
221// ============================================================================
222
/// Counters describing validation activity.
///
/// Every field is an atomic, so a single instance can be updated through a
/// shared reference.
#[derive(Debug, Default)]
pub struct ValidationMetrics {
    /// Total files validated
    pub files_validated: AtomicU64,
    /// Validation failures
    pub validation_failures: AtomicU64,
    /// Edges sampled for validation
    pub edges_sampled: AtomicU64,
    /// Corrupted edges detected
    pub corrupted_edges_detected: AtomicU64,
    /// Bounds check violations
    pub bounds_violations: AtomicU64,
    /// Total validation time (microseconds)
    pub validation_time_us: AtomicU64,
}

impl ValidationMetrics {
    /// Creates a metrics instance with every counter at zero.
    pub fn new() -> Self {
        Self::default()
    }

    /// Adds `amount` to `counter` using relaxed ordering.
    fn bump(counter: &AtomicU64, amount: u64) {
        counter.fetch_add(amount, Ordering::Relaxed);
    }

    /// Records one finished file validation.
    ///
    /// Always counts the file and adds `duration_us` to the total time; counts
    /// a failure when `success` is false.
    pub fn record_validation(&self, success: bool, duration_us: u64) {
        Self::bump(&self.files_validated, 1);
        if !success {
            Self::bump(&self.validation_failures, 1);
        }
        Self::bump(&self.validation_time_us, duration_us);
    }

    /// Records one sampled edge, and a corruption hit when `corrupted` is true.
    pub fn record_sample(&self, corrupted: bool) {
        Self::bump(&self.edges_sampled, 1);
        if corrupted {
            Self::bump(&self.corrupted_edges_detected, 1);
        }
    }

    /// Records one rejected out-of-bounds access.
    pub fn record_bounds_violation(&self) {
        Self::bump(&self.bounds_violations, 1);
    }
}
266
267// ============================================================================
268// Layer 1: Pre-Mmap File Validation
269// ============================================================================
270
271/// Configuration for pre-mmap validation
272#[derive(Debug, Clone)]
273pub struct ValidationConfig {
274    /// Perform full file checksum validation
275    pub full_checksum: bool,
276    /// Number of random edge samples (0 = no sampling)
277    pub sample_count: usize,
278    /// Maximum acceptable file size
279    pub max_file_size: u64,
280    /// Check alignment constraints
281    pub check_alignment: bool,
282    /// Required alignment for edge data
283    pub required_alignment: usize,
284}
285
286impl Default for ValidationConfig {
287    fn default() -> Self {
288        Self {
289            full_checksum: false,
290            sample_count: 100, // Default: sample 100 edges
291            max_file_size: MAX_FILE_SIZE,
292            check_alignment: true,
293            required_alignment: 8,
294        }
295    }
296}
297
298impl ValidationConfig {
299    /// High-security config with full validation
300    pub fn high_security() -> Self {
301        Self {
302            full_checksum: true,
303            sample_count: 500,
304            max_file_size: MAX_FILE_SIZE,
305            check_alignment: true,
306            required_alignment: 8,
307        }
308    }
309
310    /// Fast validation for hot path
311    pub fn fast() -> Self {
312        Self {
313            full_checksum: false,
314            sample_count: 0,
315            max_file_size: MAX_FILE_SIZE,
316            check_alignment: false,
317            required_alignment: 1,
318        }
319    }
320
321    /// Calculate optimal sample count for given parameters
322    ///
323    /// K = ceiling(ln(1/δ) / ε)
324    ///
325    /// - delta: false negative rate (probability of missing corruption)
326    /// - epsilon: minimum corruption fraction to detect
327    pub fn optimal_sample_count(delta: f64, epsilon: f64) -> usize {
328        ((1.0 / delta).ln() / epsilon).ceil() as usize
329    }
330}
331
332/// Pre-mmap file validator
333pub struct FileValidator {
334    config: ValidationConfig,
335    metrics: Arc<ValidationMetrics>,
336}
337
338impl FileValidator {
339    pub fn new(config: ValidationConfig) -> Self {
340        Self {
341            config,
342            metrics: Arc::new(ValidationMetrics::new()),
343        }
344    }
345
346    pub fn with_metrics(config: ValidationConfig, metrics: Arc<ValidationMetrics>) -> Self {
347        Self { config, metrics }
348    }
349
350    pub fn metrics(&self) -> &Arc<ValidationMetrics> {
351        &self.metrics
352    }
353
354    /// Validate file before memory mapping
355    ///
356    /// Steps:
357    /// 1. Check file size constraints
358    /// 2. Read and verify header magic/version
359    /// 3. Read and verify footer checksum (optional)
360    /// 4. Sample random edges for corruption (optional)
361    pub fn validate_before_mmap(
362        &self,
363        path: &Path,
364    ) -> std::result::Result<FileMetadata, ValidationError> {
365        let start = std::time::Instant::now();
366
367        let result = self.validate_impl(path);
368
369        let duration_us = start.elapsed().as_micros() as u64;
370        self.metrics.record_validation(result.is_ok(), duration_us);
371
372        result
373    }
374
375    fn validate_impl(&self, path: &Path) -> std::result::Result<FileMetadata, ValidationError> {
376        let mut file = File::open(path)?;
377        let file_size = file.metadata()?.len();
378
379        // Step 1: Size constraints
380        if file_size < MIN_FILE_SIZE {
381            return Err(ValidationError::FileTooSmall {
382                actual: file_size,
383                minimum: MIN_FILE_SIZE,
384            });
385        }
386
387        if file_size > self.config.max_file_size {
388            return Err(ValidationError::FileTooLarge {
389                actual: file_size,
390                maximum: self.config.max_file_size,
391            });
392        }
393
394        // Step 2: Read and verify header
395        file.seek(SeekFrom::Start(0))?;
396        let mut header = [0u8; HEADER_SIZE];
397        file.read_exact(&mut header)?;
398
399        let magic = LittleEndian::read_u64(&header[0..8]);
400        if magic != SOCHDB_MAGIC {
401            return Err(ValidationError::BadMagic {
402                expected: SOCHDB_MAGIC,
403                actual: magic,
404            });
405        }
406
407        let version = LittleEndian::read_u32(&header[8..12]);
408        if !SUPPORTED_VERSIONS.contains(&version) {
409            return Err(ValidationError::UnsupportedVersion {
410                version,
411                supported: SUPPORTED_VERSIONS.to_vec(),
412            });
413        }
414
415        let num_edges = LittleEndian::read_u64(&header[16..24]);
416        let data_offset = HEADER_SIZE as u64;
417        let data_length = num_edges * EDGE_SIZE as u64;
418
419        // Step 3: Read and verify footer
420        file.seek(SeekFrom::End(-(FOOTER_SIZE as i64)))?;
421        let mut footer = [0u8; FOOTER_SIZE];
422        file.read_exact(&mut footer)?;
423
424        // Optional: Full checksum verification
425        if self.config.full_checksum {
426            let expected_checksum: [u8; 32] = footer[0..32].try_into().unwrap();
427            let actual_checksum =
428                self.compute_checksum(&mut file, file_size - FOOTER_SIZE as u64)?;
429
430            if expected_checksum != actual_checksum {
431                return Err(ValidationError::ChecksumMismatch {
432                    expected: expected_checksum,
433                    actual: actual_checksum,
434                });
435            }
436        }
437
438        // Step 4: Sample random edges for corruption detection
439        if self.config.sample_count > 0 && num_edges > 0 {
440            self.validate_edge_samples(&mut file, data_offset, num_edges)?;
441        }
442
443        Ok(FileMetadata {
444            file_size,
445            version,
446            num_edges,
447            data_offset,
448            data_length,
449        })
450    }
451
452    fn compute_checksum(
453        &self,
454        file: &mut File,
455        length: u64,
456    ) -> std::result::Result<[u8; 32], ValidationError> {
457        file.seek(SeekFrom::Start(0))?;
458
459        let mut hasher = blake3::Hasher::new();
460        let mut buffer = vec![0u8; 64 * 1024];
461        let mut remaining = length;
462
463        while remaining > 0 {
464            let to_read = remaining.min(buffer.len() as u64) as usize;
465            file.read_exact(&mut buffer[..to_read])?;
466            hasher.update(&buffer[..to_read]);
467            remaining -= to_read as u64;
468        }
469
470        Ok(*hasher.finalize().as_bytes())
471    }
472
473    fn validate_edge_samples(
474        &self,
475        file: &mut File,
476        data_offset: u64,
477        num_edges: u64,
478    ) -> std::result::Result<(), ValidationError> {
479        // Use a simple deterministic pseudo-random sampling based on hashing
480        // This avoids adding rand dependency while still providing good coverage
481        let sample_count = self.config.sample_count.min(num_edges as usize);
482        let mut sampled_indices = HashSet::new();
483
484        // Generate sample indices using a simple hash-based PRNG
485        let mut seed = 0x12345678u64;
486        let prime = 0x9E3779B97F4A7C15u64; // Golden ratio based prime
487
488        while sampled_indices.len() < sample_count {
489            seed = seed.wrapping_mul(prime).wrapping_add(1);
490            let idx = (seed % num_edges) as usize;
491            sampled_indices.insert(idx);
492        }
493
494        let mut edge_buffer = [0u8; EDGE_SIZE];
495
496        for idx in sampled_indices {
497            let edge_offset = data_offset + (idx as u64 * EDGE_SIZE as u64);
498            file.seek(SeekFrom::Start(edge_offset))?;
499            file.read_exact(&mut edge_buffer)?;
500
501            let corrupted = !self.validate_edge(&edge_buffer, idx);
502            self.metrics.record_sample(corrupted);
503
504            if corrupted {
505                return Err(ValidationError::CorruptedEdge {
506                    index: idx,
507                    reason: "Edge validation failed".to_string(),
508                });
509            }
510        }
511
512        Ok(())
513    }
514
515    fn validate_edge(&self, edge_bytes: &[u8; EDGE_SIZE], _index: usize) -> bool {
516        // Check edge magic number
517        let edge_magic = LittleEndian::read_u32(&edge_bytes[0..4]);
518        if edge_magic != EDGE_MAGIC {
519            return false;
520        }
521
522        // Check edge CRC (last 4 bytes)
523        let expected_crc = LittleEndian::read_u32(&edge_bytes[EDGE_SIZE - 4..]);
524        let actual_crc = crc32fast::hash(&edge_bytes[..EDGE_SIZE - 4]);
525
526        expected_crc == actual_crc
527    }
528}
529
/// Metadata extracted during validation
///
/// Produced by `FileValidator::validate_before_mmap` on success.
#[derive(Debug, Clone)]
pub struct FileMetadata {
    /// Size of the file in bytes, as reported by the filesystem at validation time.
    pub file_size: u64,
    /// Format version read from header bytes `8..12`.
    pub version: u32,
    /// Edge count read from header bytes `16..24`.
    pub num_edges: u64,
    /// Byte offset of the first edge; set to `HEADER_SIZE` by the validator.
    pub data_offset: u64,
    /// Byte length of the edge region: `num_edges * EDGE_SIZE`.
    pub data_length: u64,
}
539
540// ============================================================================
541// Layer 2: Bounded Access Wrappers
542// ============================================================================
543
544/// Type-safe, bounds-checked edge reference
545///
546/// Instead of raw transmute, uses validated constructor with bounds check.
547pub struct EdgeRef<'a> {
548    bytes: &'a [u8; EDGE_SIZE],
549    _marker: PhantomData<&'a ()>,
550}
551
552impl<'a> EdgeRef<'a> {
553    /// Create a new EdgeRef with bounds and validity checking
554    ///
555    /// Returns Err if:
556    /// - offset + EDGE_SIZE > data.len()
557    /// - Edge magic is invalid
558    /// - Edge CRC does not match
559    pub fn new_checked(
560        data: &'a [u8],
561        offset: usize,
562    ) -> std::result::Result<Self, ValidationError> {
563        // Bounds check
564        if offset + EDGE_SIZE > data.len() {
565            return Err(ValidationError::OutOfBounds {
566                offset,
567                length: EDGE_SIZE,
568                region_size: data.len(),
569            });
570        }
571
572        let slice = &data[offset..offset + EDGE_SIZE];
573        let bytes: &[u8; EDGE_SIZE] =
574            slice
575                .try_into()
576                .map_err(|_| ValidationError::InvalidLength {
577                    offset: offset as u64,
578                    length: EDGE_SIZE as u64,
579                    max: data.len() as u64,
580                })?;
581
582        // Verify edge magic
583        let magic = LittleEndian::read_u32(&bytes[0..4]);
584        if magic != EDGE_MAGIC {
585            return Err(ValidationError::CorruptedEdge {
586                index: offset / EDGE_SIZE,
587                reason: format!("Bad edge magic: {:#x}", magic),
588            });
589        }
590
591        Ok(Self {
592            bytes,
593            _marker: PhantomData,
594        })
595    }
596
597    /// Create EdgeRef without validation (unsafe fast path)
598    ///
599    /// # Safety
600    /// Caller must ensure:
601    /// - offset + EDGE_SIZE <= data.len()
602    /// - Edge data is valid
603    pub unsafe fn new_unchecked(data: &'a [u8], offset: usize) -> Self {
604        let bytes: &[u8; EDGE_SIZE] = unsafe {
605            data[offset..offset + EDGE_SIZE]
606                .try_into()
607                .unwrap_unchecked()
608        };
609        Self {
610            bytes,
611            _marker: PhantomData,
612        }
613    }
614
615    /// Get raw bytes
616    pub fn as_bytes(&self) -> &[u8; EDGE_SIZE] {
617        self.bytes
618    }
619
620    /// Get source vertex ID (with bounds check)
621    pub fn source_id(&self) -> u64 {
622        LittleEndian::read_u64(&self.bytes[4..12])
623    }
624
625    /// Get target vertex ID (with bounds check)
626    pub fn target_id(&self) -> u64 {
627        LittleEndian::read_u64(&self.bytes[12..20])
628    }
629
630    /// Get edge weight (with bounds check)
631    pub fn weight(&self) -> f64 {
632        LittleEndian::read_f64(&self.bytes[20..28])
633    }
634
635    /// Get edge type (with bounds check)
636    pub fn edge_type(&self) -> u32 {
637        LittleEndian::read_u32(&self.bytes[28..32])
638    }
639
640    /// Get timestamp (with bounds check)
641    pub fn timestamp(&self) -> u64 {
642        LittleEndian::read_u64(&self.bytes[32..40])
643    }
644
645    /// Get payload bytes with bounds validation
646    ///
647    /// Payload is stored at variable offset within the edge
648    pub fn payload_bytes(&self) -> std::result::Result<&'a [u8], ValidationError> {
649        let payload_offset = LittleEndian::read_u32(&self.bytes[40..44]) as usize;
650        let payload_length = LittleEndian::read_u32(&self.bytes[44..48]) as usize;
651
652        // Bounds check within edge
653        if payload_offset + payload_length > EDGE_SIZE - 4 {
654            // -4 for CRC
655            return Err(ValidationError::InvalidOffset {
656                offset: payload_offset as u64,
657                max: (EDGE_SIZE - 4) as u64,
658            });
659        }
660
661        Ok(&self.bytes[payload_offset..payload_offset + payload_length])
662    }
663
664    /// Verify edge CRC
665    pub fn verify_crc(&self) -> bool {
666        let expected_crc = LittleEndian::read_u32(&self.bytes[EDGE_SIZE - 4..]);
667        let actual_crc = crc32fast::hash(&self.bytes[..EDGE_SIZE - 4]);
668        expected_crc == actual_crc
669    }
670}
671
672// ============================================================================
673// Layer 3: ValidatedMmap Wrapper
674// ============================================================================
675
676/// Validated memory-mapped region with bounds checking
677///
678/// All accesses are bounds-checked to prevent undefined behavior
679/// from corrupted or truncated files.
680pub struct ValidatedMmap {
681    /// Underlying mmap (via memmap2 or similar)
682    data: Vec<u8>, // Using Vec for safety; in production use memmap2::Mmap
683    /// File metadata from validation
684    metadata: FileMetadata,
685    /// Whether file has been truncated
686    is_valid: AtomicBool,
687    /// Access metrics
688    metrics: Arc<ValidationMetrics>,
689}
690
691impl ValidatedMmap {
692    /// Create a new ValidatedMmap with full validation
693    pub fn open(
694        path: &Path,
695        config: ValidationConfig,
696    ) -> std::result::Result<Self, ValidationError> {
697        let validator = FileValidator::new(config);
698        let metadata = validator.validate_before_mmap(path)?;
699
700        // Read entire file (in production, use mmap)
701        let mut file = File::open(path)?;
702        let mut data = Vec::with_capacity(metadata.file_size as usize);
703        file.read_to_end(&mut data)?;
704
705        Ok(Self {
706            data,
707            metadata,
708            is_valid: AtomicBool::new(true),
709            metrics: validator.metrics,
710        })
711    }
712
713    /// Get file metadata
714    pub fn metadata(&self) -> &FileMetadata {
715        &self.metadata
716    }
717
718    /// Check if mmap is still valid
719    pub fn is_valid(&self) -> bool {
720        self.is_valid.load(Ordering::Acquire)
721    }
722
723    /// Get a validated edge reference
724    pub fn get_edge(&self, index: usize) -> std::result::Result<EdgeRef<'_>, ValidationError> {
725        if !self.is_valid() {
726            return Err(ValidationError::TruncatedFile {
727                expected: self.metadata.file_size,
728                actual: self.data.len() as u64,
729            });
730        }
731
732        if index >= self.metadata.num_edges as usize {
733            self.metrics.record_bounds_violation();
734            return Err(ValidationError::OutOfBounds {
735                offset: index * EDGE_SIZE + self.metadata.data_offset as usize,
736                length: EDGE_SIZE,
737                region_size: self.data.len(),
738            });
739        }
740
741        let offset = self.metadata.data_offset as usize + index * EDGE_SIZE;
742        EdgeRef::new_checked(&self.data, offset)
743    }
744
745    /// Get a slice of the data with bounds checking
746    pub fn slice(&self, range: Range<usize>) -> std::result::Result<&[u8], ValidationError> {
747        if !self.is_valid() {
748            return Err(ValidationError::TruncatedFile {
749                expected: self.metadata.file_size,
750                actual: self.data.len() as u64,
751            });
752        }
753
754        if range.end > self.data.len() {
755            self.metrics.record_bounds_violation();
756            return Err(ValidationError::OutOfBounds {
757                offset: range.start,
758                length: range.end - range.start,
759                region_size: self.data.len(),
760            });
761        }
762
763        Ok(&self.data[range])
764    }
765
766    /// Iterate over all edges with validation
767    pub fn iter_edges(&self) -> ValidatedEdgeIterator<'_> {
768        ValidatedEdgeIterator {
769            mmap: self,
770            current_index: 0,
771        }
772    }
773
774    /// Get number of edges
775    pub fn num_edges(&self) -> usize {
776        self.metadata.num_edges as usize
777    }
778
779    /// Mark mmap as invalid (e.g., after detecting truncation)
780    pub fn invalidate(&self) {
781        self.is_valid.store(false, Ordering::Release);
782    }
783
784    /// Verify integrity of all edges
785    pub fn verify_all(&self) -> std::result::Result<usize, ValidationError> {
786        let mut valid_count = 0;
787        for i in 0..self.metadata.num_edges as usize {
788            let edge = self.get_edge(i)?;
789            if edge.verify_crc() {
790                valid_count += 1;
791            }
792        }
793        Ok(valid_count)
794    }
795}
796
797/// Iterator over validated edges
798pub struct ValidatedEdgeIterator<'a> {
799    mmap: &'a ValidatedMmap,
800    current_index: usize,
801}
802
803impl<'a> Iterator for ValidatedEdgeIterator<'a> {
804    type Item = std::result::Result<EdgeRef<'a>, ValidationError>;
805
806    fn next(&mut self) -> Option<Self::Item> {
807        if self.current_index >= self.mmap.num_edges() {
808            return None;
809        }
810
811        let result = self.mmap.get_edge(self.current_index);
812        self.current_index += 1;
813        Some(result)
814    }
815
816    fn size_hint(&self) -> (usize, Option<usize>) {
817        let remaining = self.mmap.num_edges() - self.current_index;
818        (remaining, Some(remaining))
819    }
820}
821
822impl<'a> ExactSizeIterator for ValidatedEdgeIterator<'a> {}
823
824// ============================================================================
825// Offset Validation Helpers
826// ============================================================================
827
828/// Validates that an offset and length are within bounds
829#[inline]
830pub fn validate_offset_length(
831    offset: u64,
832    length: u64,
833    max: u64,
834) -> std::result::Result<(), ValidationError> {
835    if offset > max {
836        return Err(ValidationError::InvalidOffset { offset, max });
837    }
838    if offset + length > max {
839        return Err(ValidationError::InvalidLength {
840            offset,
841            length,
842            max,
843        });
844    }
845    Ok(())
846}
847
848/// Validates alignment of an offset
849#[inline]
850pub fn validate_alignment(
851    offset: u64,
852    alignment: usize,
853) -> std::result::Result<(), ValidationError> {
854    if !(offset as usize).is_multiple_of(alignment) {
855        return Err(ValidationError::AlignmentViolation {
856            offset,
857            required_alignment: alignment,
858        });
859    }
860    Ok(())
861}
862
863// ============================================================================
864// Tests
865// ============================================================================
866
#[cfg(test)]
mod tests {
    use super::*;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Encodes one well-formed edge `source -> source + 1` with a matching trailing CRC.
    fn encode_edge(source: u64) -> [u8; EDGE_SIZE] {
        const CRC_START: usize = EDGE_SIZE - 4;

        let mut record = [0u8; EDGE_SIZE];
        LittleEndian::write_u32(&mut record[0..4], EDGE_MAGIC);
        LittleEndian::write_u64(&mut record[4..12], source);
        LittleEndian::write_u64(&mut record[12..20], source + 1);

        let crc = crc32fast::hash(&record[..CRC_START]);
        LittleEndian::write_u32(&mut record[CRC_START..], crc);
        record
    }

    /// Writes `bytes` to a fresh temp file and flushes it.
    fn temp_file_with(bytes: &[u8]) -> NamedTempFile {
        let mut tmp = NamedTempFile::new().unwrap();
        tmp.write_all(bytes).unwrap();
        tmp.flush().unwrap();
        tmp
    }

    /// Version-1 file: header declaring two edges, two valid edges, zeroed footer.
    fn create_valid_test_file() -> NamedTempFile {
        let mut image = Vec::with_capacity(HEADER_SIZE + 2 * EDGE_SIZE + FOOTER_SIZE);

        let mut header = [0u8; HEADER_SIZE];
        LittleEndian::write_u64(&mut header[0..8], SOCHDB_MAGIC);
        LittleEndian::write_u32(&mut header[8..12], 1); // version
        LittleEndian::write_u64(&mut header[16..24], 2); // num_edges
        image.extend_from_slice(&header);

        for source in 0..2u64 {
            image.extend_from_slice(&encode_edge(source));
        }

        image.extend_from_slice(&[0u8; FOOTER_SIZE]);

        temp_file_with(&image)
    }

    /// Opens the standard two-edge fixture through the fast config.
    fn open_fixture(file: &NamedTempFile) -> ValidatedMmap {
        ValidatedMmap::open(file.path(), ValidationConfig::fast()).unwrap()
    }

    #[test]
    fn test_file_too_small() {
        let file = temp_file_with(&[0u8; 100]);

        let outcome =
            FileValidator::new(ValidationConfig::default()).validate_before_mmap(file.path());

        assert!(matches!(outcome, Err(ValidationError::FileTooSmall { .. })));
    }

    #[test]
    fn test_bad_magic() {
        // Minimum-size file of zeros whose first 8 bytes hold a wrong magic.
        let mut image = vec![0u8; MIN_FILE_SIZE as usize];
        LittleEndian::write_u64(&mut image[0..8], 0xDEADBEEF);
        let file = temp_file_with(&image);

        let outcome =
            FileValidator::new(ValidationConfig::fast()).validate_before_mmap(file.path());

        assert!(matches!(outcome, Err(ValidationError::BadMagic { .. })));
    }

    #[test]
    fn test_valid_file() {
        let file = create_valid_test_file();

        let outcome =
            FileValidator::new(ValidationConfig::fast()).validate_before_mmap(file.path());

        assert!(outcome.is_ok());
        let meta = outcome.unwrap();
        assert_eq!(meta.version, 1);
        assert_eq!(meta.num_edges, 2);
    }

    #[test]
    fn test_edge_ref_bounds_check() {
        let file = create_valid_test_file();
        let mmap = open_fixture(&file);

        // Valid access
        assert!(mmap.get_edge(0).is_ok());

        // Out of bounds
        assert!(matches!(
            mmap.get_edge(100),
            Err(ValidationError::OutOfBounds { .. })
        ));
    }

    #[test]
    fn test_edge_ref_crc_verification() {
        let file = create_valid_test_file();
        let mmap = open_fixture(&file);

        let first = mmap.get_edge(0).unwrap();
        assert!(first.verify_crc());
    }

    #[test]
    fn test_validated_iterator() {
        let file = create_valid_test_file();
        let mmap = open_fixture(&file);

        let collected: Vec<_> = mmap.iter_edges().collect();
        assert_eq!(collected.len(), 2);
        assert!(collected.iter().all(|item| item.is_ok()));
    }

    #[test]
    fn test_optimal_sample_count() {
        // K = ceiling(ln(1/δ) / ε)
        // For δ = 0.01, ε = 0.01: K = ceiling(ln(100) / 0.01) ≈ 461
        let samples = ValidationConfig::optimal_sample_count(0.01, 0.01);
        assert!((460..=470).contains(&samples));
    }
}