// matchy_format/validation.rs

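//! Validation for matchy format file structure
//!
//! Provides validation of pattern-to-data mappings and other format-level
//! consistency checks, along with the [`EntryValidator`] trait for vetting
//! entries before they are inserted into a database.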
5
6use crate::{ParaglobHeader, PatternDataMapping};
7use matchy_data_format::DataValue;
8use std::collections::{HashMap, HashSet};
9use std::error::Error;
10use zerocopy::FromBytes;
11
12/// Trait for validating entry data before insertion into a database
13///
14/// Implement this trait to provide custom validation logic for entries
15/// being added to a [`DatabaseBuilder`](crate::DatabaseBuilder).
16///
17/// # Example
18///
19/// ```rust,ignore
20/// use matchy_format::{DatabaseBuilder, EntryValidator};
21/// use matchy_data_format::DataValue;
22/// use std::collections::HashMap;
23/// use std::error::Error;
24///
25/// struct RequiredFieldValidator {
26///     required_fields: Vec<String>,
27/// }
28///
29/// impl EntryValidator for RequiredFieldValidator {
30///     fn validate(
31///         &self,
32///         key: &str,
33///         data: &HashMap<String, DataValue>,
34///     ) -> Result<(), Box<dyn Error + Send + Sync>> {
35///         for field in &self.required_fields {
36///             if !data.contains_key(field) {
37///                 return Err(format!(
38///                     "Entry '{}': missing required field '{}'",
39///                     key, field
40///                 ).into());
41///             }
42///         }
43///         Ok(())
44///     }
45/// }
46///
47/// let validator = RequiredFieldValidator {
48///     required_fields: vec!["threat_level".to_string(), "source".to_string()],
49/// };
50///
51/// let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive)
52///     .with_validator(Box::new(validator));
53///
54/// // This will fail validation
55/// builder.add_entry("1.2.3.4", HashMap::new())?;
56/// ```
57pub trait EntryValidator: Send + Sync {
58    /// Validate entry data before insertion
59    ///
60    /// # Arguments
61    /// * `key` - The entry key (IP, domain, pattern, etc.)
62    /// * `data` - The data map to be associated with this entry
63    ///
64    /// # Returns
65    /// * `Ok(())` if validation passes
66    /// * `Err(...)` with a descriptive error message if validation fails
67    fn validate(
68        &self,
69        key: &str,
70        data: &HashMap<String, DataValue>,
71    ) -> Result<(), Box<dyn Error + Send + Sync>>;
72}
73
74/// Validation result for format-level checks
75#[derive(Debug, Clone)]
76pub struct FormatValidationResult {
77    /// Errors found during validation
78    pub errors: Vec<String>,
79    /// Warnings about potential issues
80    pub warnings: Vec<String>,
81    /// Validation statistics
82    pub stats: FormatStats,
83}
84
85impl FormatValidationResult {
86    /// Create a new empty validation result
87    #[must_use]
88    pub fn new() -> Self {
89        Self {
90            errors: Vec::new(),
91            warnings: Vec::new(),
92            stats: FormatStats::default(),
93        }
94    }
95
96    /// Check if validation passed (no errors)
97    #[must_use]
98    pub fn is_valid(&self) -> bool {
99        self.errors.is_empty()
100    }
101
102    /// Add an error
103    pub fn error(&mut self, msg: String) {
104        self.errors.push(msg);
105    }
106
107    /// Add a warning
108    pub fn warning(&mut self, msg: String) {
109        self.warnings.push(msg);
110    }
111}
112
113impl Default for FormatValidationResult {
114    fn default() -> Self {
115        Self::new()
116    }
117}
118
/// Statistics from format validation
///
/// Plain counters with a zeroed [`Default`]. The type derives `PartialEq`
/// and `Eq` so two sets of statistics can be compared directly (for example
/// in assertions) instead of field by field.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct FormatStats {
    /// Number of mappings validated
    pub mappings_validated: usize,
    /// Number of patterns with data
    pub patterns_with_data: usize,
    /// Number of duplicate mappings found
    pub duplicate_mappings: usize,
}
129
130/// Validate data section mapping consistency (v2+ format)
131///
132/// This function validates the pattern→data mapping table to ensure:
133/// - All pattern IDs are valid (< pattern_count)
134/// - No duplicate pattern IDs in mapping table
135/// - Data offsets and sizes are within bounds
136///
137/// # Arguments
138/// * `buffer` - Full file buffer
139/// * `header` - Parsed ParaglobHeader
140///
141/// # Returns
142/// Validation result with errors, warnings, and coverage statistics
143#[must_use]
144pub fn validate_data_mapping_consistency(
145    buffer: &[u8],
146    header: &ParaglobHeader,
147) -> FormatValidationResult {
148    let mut result = FormatValidationResult::new();
149
150    let mapping_offset = header.mapping_table_offset as usize;
151    let mapping_count = header.mapping_count as usize;
152    let data_offset = header.data_section_offset as usize;
153    let data_size = header.data_section_size as usize;
154
155    if mapping_count == 0 {
156        // No mappings is valid (not all patterns need data)
157        return result;
158    }
159
160    if mapping_offset == 0 {
161        result.warning("Mapping table offset is 0 but mapping_count > 0".to_string());
162        return result;
163    }
164
165    let mut patterns_with_data = HashSet::new();
166    let mut duplicate_mappings = 0;
167
168    for i in 0..mapping_count {
169        let entry_offset = mapping_offset + i * std::mem::size_of::<PatternDataMapping>();
170        if entry_offset + std::mem::size_of::<PatternDataMapping>() > buffer.len() {
171            result.error(format!(
172                "Mapping entry {i} at offset {entry_offset} truncated"
173            ));
174            continue;
175        }
176
177        let mapping = match PatternDataMapping::read_from_prefix(&buffer[entry_offset..]) {
178            Ok((m, _)) => m,
179            Err(_) => {
180                result.error(format!(
181                    "Failed to read PatternDataMapping at offset {entry_offset}"
182                ));
183                continue;
184            }
185        };
186
187        // Check for duplicate pattern IDs in mapping table
188        if !patterns_with_data.insert(mapping.pattern_id) {
189            duplicate_mappings += 1;
190        }
191
192        // Validate pattern ID is valid
193        if mapping.pattern_id >= header.pattern_count {
194            result.error(format!(
195                "Mapping entry {} references invalid pattern ID {} (max: {})",
196                i,
197                mapping.pattern_id,
198                header.pattern_count - 1
199            ));
200            continue;
201        }
202
203        // Validate inline data bounds if applicable
204        if header.has_inline_data() {
205            let data_ref = mapping.data_offset as usize;
206            // Check if this looks like an inline data reference
207            if data_ref >= data_offset && data_ref < data_offset + data_size {
208                let data_end = data_ref + mapping.data_size as usize;
209                if data_end > data_offset + data_size {
210                    result.error(format!(
211                        "Mapping entry {} data range [{}, {}) exceeds data section [{}, {})",
212                        i,
213                        data_ref,
214                        data_end,
215                        data_offset,
216                        data_offset + data_size
217                    ));
218                }
219            }
220        }
221
222        result.stats.mappings_validated += 1;
223    }
224
225    result.stats.patterns_with_data = patterns_with_data.len();
226    result.stats.duplicate_mappings = duplicate_mappings;
227
228    if duplicate_mappings > 0 {
229        result.warning(format!(
230            "Found {duplicate_mappings} duplicate pattern IDs in data mapping table"
231        ));
232    }
233
234    result
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a header whose mapping table starts at byte 1000 and whose
    /// data section covers bytes [5000, 6000), with the inline-data flag set.
    fn create_test_header(pattern_count: u32, mapping_count: u32) -> ParaglobHeader {
        let mut header = ParaglobHeader::new();
        header.pattern_count = pattern_count;
        header.mapping_count = mapping_count;
        header.mapping_table_offset = 1000; // Arbitrary offset
        header.data_section_offset = 5000;
        header.data_section_size = 1000;
        header.data_flags = 0x01; // Inline data
        header
    }

    /// Serialize one mapping entry as three consecutive little-endian u32s.
    fn encode_mapping(pattern_id: u32, data_offset: u32, data_size: u32) -> Vec<u8> {
        [pattern_id, data_offset, data_size]
            .iter()
            .flat_map(|field| field.to_le_bytes())
            .collect()
    }

    /// Create a zeroed 6000-byte buffer and write `entries`, given as
    /// `(pattern_id, data_offset, data_size)`, back-to-back from byte 1000,
    /// which is where `create_test_header` places the mapping table.
    fn buffer_with_mappings(entries: &[(u32, u32, u32)]) -> Vec<u8> {
        let mut buffer = vec![0u8; 6000];
        let mut cursor = 1000;
        for &(pattern_id, data_offset, data_size) in entries {
            let bytes = encode_mapping(pattern_id, data_offset, data_size);
            buffer[cursor..cursor + bytes.len()].copy_from_slice(&bytes);
            cursor += bytes.len();
        }
        buffer
    }

    #[test]
    fn test_validate_no_mappings() {
        let header = create_test_header(10, 0);
        let buffer = vec![0u8; 6000];

        let outcome = validate_data_mapping_consistency(&buffer, &header);
        assert!(outcome.is_valid());
        assert_eq!(outcome.stats.mappings_validated, 0);
    }

    #[test]
    fn test_validate_valid_mappings() {
        let header = create_test_header(10, 3);
        // Three valid mappings at offset 1000
        let buffer = buffer_with_mappings(&[(0, 5100, 50), (1, 5200, 50), (2, 5300, 50)]);

        let outcome = validate_data_mapping_consistency(&buffer, &header);
        assert!(outcome.is_valid());
        assert_eq!(outcome.stats.mappings_validated, 3);
        assert_eq!(outcome.stats.patterns_with_data, 3);
        assert_eq!(outcome.stats.duplicate_mappings, 0);
    }

    #[test]
    fn test_validate_duplicate_pattern_ids() {
        let header = create_test_header(10, 3);
        let buffer = buffer_with_mappings(&[
            (0, 5100, 50),
            (1, 5200, 50),
            (0, 5300, 50), // Duplicate!
        ]);

        let outcome = validate_data_mapping_consistency(&buffer, &header);
        assert!(outcome.is_valid()); // Duplicates are warnings, not errors
        assert_eq!(outcome.warnings.len(), 1);
        assert_eq!(outcome.stats.duplicate_mappings, 1);
        assert_eq!(outcome.stats.patterns_with_data, 2); // Only 2 unique patterns
    }

    #[test]
    fn test_validate_invalid_pattern_id() {
        let header = create_test_header(10, 2);
        let buffer = buffer_with_mappings(&[
            (5, 5100, 50),
            (99, 5200, 50), // Invalid! >= pattern_count
        ]);

        let outcome = validate_data_mapping_consistency(&buffer, &header);
        assert!(!outcome.is_valid());
        assert_eq!(outcome.errors.len(), 1);
        assert!(outcome.errors[0].contains("invalid pattern ID 99"));
    }

    #[test]
    fn test_validate_data_bounds() {
        let header = create_test_header(10, 2);
        let buffer = buffer_with_mappings(&[
            (0, 5100, 50),  // Valid
            (1, 5900, 200), // Exceeds data section (5000 + 1000 = 6000)
        ]);

        let outcome = validate_data_mapping_consistency(&buffer, &header);
        assert!(!outcome.is_valid());
        assert_eq!(outcome.errors.len(), 1);
        assert!(outcome.errors[0].contains("exceeds data section"));
    }

    #[test]
    fn test_validate_truncated_mapping_table() {
        let header = create_test_header(10, 3);
        let buffer = vec![0u8; 1020]; // Too small to hold all 3 mappings

        let outcome = validate_data_mapping_consistency(&buffer, &header);
        assert!(!outcome.is_valid());
        assert!(outcome.errors.iter().any(|e| e.contains("truncated")));
    }
}