matchy_data_format/
validation.rs

1//! Validation for MMDB data section encoding
2//!
3//! Provides validation of decoded DataValue structures to ensure:
4//! - UTF-8 validity in strings (critical for safety)
5//! - Structural integrity of data values
6//!
7//! These validations are building blocks that can be used by higher-level
8//! validators (like MMDB validation) that understand file structure.
9
10use crate::{DataDecoder, DataValue};
11
12/// Validation result for data format checks
13#[derive(Debug, Clone)]
14pub struct DataFormatValidationResult {
15    /// Errors found during validation
16    pub errors: Vec<String>,
17    /// Warnings about potential issues
18    pub warnings: Vec<String>,
19    /// Validation statistics
20    pub stats: DataFormatStats,
21}
22
23impl DataFormatValidationResult {
24    /// Create a new empty validation result
25    #[must_use]
26    pub fn new() -> Self {
27        Self {
28            errors: Vec::new(),
29            warnings: Vec::new(),
30            stats: DataFormatStats::default(),
31        }
32    }
33
34    /// Check if validation passed (no errors)
35    #[must_use]
36    pub fn is_valid(&self) -> bool {
37        self.errors.is_empty()
38    }
39
40    /// Add an error
41    pub fn error(&mut self, msg: String) {
42        self.errors.push(msg);
43    }
44
45    /// Add a warning
46    pub fn warning(&mut self, msg: String) {
47        self.warnings.push(msg);
48    }
49}
50
51impl Default for DataFormatValidationResult {
52    fn default() -> Self {
53        Self::new()
54    }
55}
56
/// Counters collected during data format validation
///
/// All counters start at zero via [`Default`].
#[derive(Clone, Debug, Default)]
pub struct DataFormatStats {
    /// How many strings were validated
    pub strings_checked: u32,
    /// How many maps were validated
    pub maps_checked: u32,
    /// How many arrays were validated
    pub arrays_checked: u32,
    /// How many values were validated in total
    pub values_checked: u32,
}
69
70/// Validate UTF-8 in a decoded data value at the given offset
71///
72/// This function attempts to decode a value from the data section buffer
73/// and recursively validates all strings within it.
74///
75/// # Arguments
76/// * `data_section` - Raw data section bytes
77/// * `offset` - Offset within data section to decode from
78/// * `base_offset` - Base offset for pointer calculations (0 for standalone)
79///
80/// # Returns
81/// * `Ok(count)` - Number of strings validated (all valid)
82/// * `Err(msg)` - Error message if invalid UTF-8 found or decode failed
83pub fn validate_data_value_utf8(
84    data_section: &[u8],
85    offset: usize,
86    base_offset: usize,
87) -> Result<u32, String> {
88    let decoder = DataDecoder::new(data_section, base_offset);
89    let offset_u32 =
90        u32::try_from(offset).map_err(|_| format!("Offset {offset} exceeds u32::MAX"))?;
91
92    match decoder.decode(offset_u32) {
93        Ok(value) => validate_value_strings_utf8(&value),
94        Err(e) => Err(format!("Failed to decode data value: {e}")),
95    }
96}
97
98/// Recursively validate UTF-8 in all strings within a DataValue
99///
100/// This function traverses the DataValue structure and counts all strings,
101/// verifying they are valid UTF-8. Since DataValue::String already guarantees
102/// UTF-8 validity (enforced during decoding), this primarily serves as a
103/// structural validator and counter.
104///
105/// # Arguments
106/// * `value` - DataValue to validate
107///
108/// # Returns
109/// * `Ok(count)` - Number of strings found (all valid UTF-8)
110/// * `Err(msg)` - Error message if validation fails
111///
112/// # Note
113/// The DataDecoder already enforces UTF-8 validity when creating String variants,
114/// so this function won't find invalid UTF-8 in properly decoded values.
115/// It's useful for:
116/// - Counting strings in a structure
117/// - Detecting decode issues early
118/// - Providing structural validation
119pub fn validate_value_strings_utf8(value: &DataValue) -> Result<u32, String> {
120    let mut count = 0u32;
121
122    match value {
123        DataValue::String(_s) => {
124            // String is already validated UTF-8 when decoded
125            count += 1;
126        }
127        DataValue::Map(map) => {
128            for val in map.values() {
129                // Map keys are always strings, and already validated
130                count += 1;
131                // Recursively validate values
132                count += validate_value_strings_utf8(val)?;
133            }
134        }
135        DataValue::Array(arr) => {
136            for val in arr {
137                count += validate_value_strings_utf8(val)?;
138            }
139        }
140        // Other types don't contain strings
141        DataValue::Pointer(_)
142        | DataValue::Double(_)
143        | DataValue::Bytes(_)
144        | DataValue::Uint16(_)
145        | DataValue::Uint32(_)
146        | DataValue::Int32(_)
147        | DataValue::Uint64(_)
148        | DataValue::Uint128(_)
149        | DataValue::Bool(_)
150        | DataValue::Float(_)
151        | DataValue::Timestamp(_) => {}
152    }
153
154    Ok(count)
155}
156
157/// Validate data section structure by attempting to decode values
158///
159/// This is a comprehensive validation that attempts to decode all reachable
160/// data values in a data section buffer.
161///
162/// # Arguments
163/// * `data_section` - Raw data section bytes
164/// * `base_offset` - Base offset for pointer calculations
165/// * `offsets_to_check` - Specific offsets to validate (if empty, validates entire section)
166///
167/// # Returns
168/// Validation result with errors, warnings, and statistics
169#[must_use]
170pub fn validate_data_section(
171    data_section: &[u8],
172    base_offset: usize,
173    offsets_to_check: &[u32],
174) -> DataFormatValidationResult {
175    let mut result = DataFormatValidationResult::new();
176
177    if data_section.is_empty() {
178        result.warning("Data section is empty".to_string());
179        return result;
180    }
181
182    let decoder = DataDecoder::new(data_section, base_offset);
183
184    // If specific offsets provided, check those
185    if offsets_to_check.is_empty() {
186        // If no specific offsets, just validate that the section is well-formed
187        result.warning("No specific offsets to validate".to_string());
188    } else {
189        for &offset in offsets_to_check {
190            match decoder.decode(offset) {
191                Ok(value) => {
192                    result.stats.values_checked += 1;
193                    match validate_value_strings_utf8(&value) {
194                        Ok(count) => {
195                            result.stats.strings_checked += count;
196                        }
197                        Err(e) => {
198                            result.error(format!("Invalid UTF-8 at offset {offset}: {e}"));
199                        }
200                    }
201
202                    // Update type-specific stats
203                    update_stats_for_value(&value, &mut result.stats);
204                }
205                Err(e) => {
206                    result.error(format!("Failed to decode at offset {offset}: {e}"));
207                }
208            }
209        }
210    }
211
212    result
213}
214
215/// Update statistics based on value type
216fn update_stats_for_value(value: &DataValue, stats: &mut DataFormatStats) {
217    match value {
218        DataValue::Map(m) => {
219            stats.maps_checked += 1;
220            for val in m.values() {
221                update_stats_for_value(val, stats);
222            }
223        }
224        DataValue::Array(arr) => {
225            stats.arrays_checked += 1;
226            for val in arr {
227                update_stats_for_value(val, stats);
228            }
229        }
230        _ => {}
231    }
232}
233
/// Upper bound considered safe for the length of a pointer chain in MMDB data
pub const MAX_POINTER_DEPTH: usize = 32;

/// Upper bound on combined nesting depth (array/map nesting plus pointer hops)
pub const MAX_TOTAL_DEPTH: usize = 64;
239
/// Validation error types for MMDB data section pointer chains
///
/// Derives `Clone`, `PartialEq` and `Eq` so callers and tests can copy and
/// compare errors directly instead of matching on formatted text.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum PointerValidationError {
    /// Cycle detected in pointer chain
    Cycle {
        /// Offset that was reached a second time on the current traversal path
        offset: usize,
    },
    /// Depth limit exceeded
    DepthExceeded {
        /// Depth at which the limit was exceeded
        depth: usize,
    },
    /// Invalid offset encountered
    InvalidOffset {
        /// Offset that could not be used
        offset: usize,
        /// Human-readable explanation of why the offset is invalid
        reason: String,
    },
    /// Invalid type ID
    InvalidType {
        /// Offset of the value carrying the unsupported type
        offset: usize,
        /// Type identifier that was not recognised
        type_id: u8,
    },
}
252
253impl std::fmt::Display for PointerValidationError {
254    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
255        match self {
256            Self::Cycle { offset } => {
257                write!(f, "Pointer cycle detected at offset {offset}")
258            }
259            Self::DepthExceeded { depth } => {
260                write!(f, "Depth {depth} exceeds limit")
261            }
262            Self::InvalidOffset { offset, reason } => {
263                write!(f, "Invalid offset {offset} ({reason})")
264            }
265            Self::InvalidType { offset, type_id } => {
266                write!(f, "Invalid type {type_id} at offset {offset}")
267            }
268        }
269    }
270}
271
272impl std::error::Error for PointerValidationError {}
273
274/// Result of MMDB data section pointer validation
275#[derive(Debug, Clone)]
276pub struct PointerValidationResult {
277    /// Errors found
278    pub errors: Vec<String>,
279    /// Warnings found  
280    pub warnings: Vec<String>,
281    /// Statistics
282    pub stats: PointerValidationStats,
283}
284
/// Counters collected during pointer validation
///
/// All counters start at zero via [`Default`].
#[derive(Clone, Debug, Default)]
pub struct PointerValidationStats {
    /// How many pointers were checked
    pub pointers_checked: usize,
    /// How many cycles were detected
    pub cycles_detected: usize,
    /// Deepest level found
    pub max_depth: usize,
    /// How many invalid pointers were found
    pub invalid_pointers: usize,
}
297
298impl PointerValidationResult {
299    /// Create new empty result
300    #[must_use]
301    pub fn new() -> Self {
302        Self {
303            errors: Vec::new(),
304            warnings: Vec::new(),
305            stats: PointerValidationStats::default(),
306        }
307    }
308
309    /// Check if validation passed
310    #[must_use]
311    pub fn is_valid(&self) -> bool {
312        self.errors.is_empty()
313    }
314}
315
316impl Default for PointerValidationResult {
317    fn default() -> Self {
318        Self::new()
319    }
320}
321
322/// Validate a data value and all pointers it contains
323///
324/// Returns the maximum depth of pointer chains encountered.
325/// Detects cycles using the visited set.
326///
327/// # Arguments
328/// * `data_section` - Raw data section bytes
329/// * `offset` - Offset within data section to start validation
330/// * `path` - Set of offsets in the current traversal path (for cycle detection)
331/// * `depth` - Current depth in pointer chain
332///
333/// # Returns
334/// * `Ok(max_depth)` - Maximum depth reached
335/// * `Err` - Validation error encountered
336///
337/// # Note
338/// The `path` set tracks ancestors in the current traversal path, not all visited nodes.
339/// This allows legitimate pointer reuse (data deduplication) while still detecting true cycles
340/// where a value references itself or an ancestor.
341pub fn validate_data_value_pointers(
342    data_section: &[u8],
343    offset: usize,
344    path: &mut std::collections::HashSet<usize>,
345    depth: usize,
346) -> Result<usize, PointerValidationError> {
347    // Check depth limit
348    if depth > MAX_TOTAL_DEPTH {
349        return Err(PointerValidationError::DepthExceeded { depth });
350    }
351
352    // Check for cycles - only an error if this offset is an ancestor in the current path
353    if path.contains(&offset) {
354        return Err(PointerValidationError::Cycle { offset });
355    }
356
357    // Validate offset bounds before adding to path
358    if offset >= data_section.len() {
359        return Err(PointerValidationError::InvalidOffset {
360            offset,
361            reason: "Offset beyond data section".to_string(),
362        });
363    }
364
365    // Add to current path
366    path.insert(offset);
367
368    // Read control byte
369    let ctrl = data_section[offset];
370    let type_id = ctrl >> 5;
371    let payload = ctrl & 0x1F;
372
373    let mut cursor = offset + 1;
374    let mut max_child_depth = depth;
375
376    let result = (|| {
377        match type_id {
378            0 => {
379                // Extended type
380                if cursor >= data_section.len() {
381                    return Err(PointerValidationError::InvalidOffset {
382                        offset,
383                        reason: "Extended type truncated".to_string(),
384                    });
385                }
386                let raw_ext_type = data_section[cursor];
387                cursor += 1;
388                let ext_type_id = 7 + raw_ext_type;
389
390                match ext_type_id {
391                    11 => {
392                        // Array - validate all elements
393                        let count = decode_size_for_validation(data_section, &mut cursor, payload)?;
394                        for _ in 0..count {
395                            let child_depth = validate_data_value_pointers(
396                                data_section,
397                                cursor,
398                                path,
399                                depth + 1,
400                            )?;
401                            max_child_depth = max_child_depth.max(child_depth);
402                            cursor = skip_data_value(data_section, cursor)?;
403                        }
404                    }
405                    8 | 9 | 10 | 14 | 15 => {
406                        // Int32, Uint64, Uint128, Bool, Float - no pointers
407                    }
408                    _ => {
409                        return Err(PointerValidationError::InvalidType {
410                            offset,
411                            type_id: ext_type_id,
412                        });
413                    }
414                }
415            }
416            1 => {
417                // Pointer - critical to validate!
418                let pointer_offset = decode_pointer_offset(data_section, &mut cursor, payload)?;
419
420                // Validate pointer target
421                if pointer_offset >= data_section.len() {
422                    return Err(PointerValidationError::InvalidOffset {
423                        offset: pointer_offset,
424                        reason: "Pointer target beyond data section".to_string(),
425                    });
426                }
427
428                // Recursively validate pointed-to value
429                let child_depth =
430                    validate_data_value_pointers(data_section, pointer_offset, path, depth + 1)?;
431                max_child_depth = max_child_depth.max(child_depth);
432            }
433            2..=6 => {
434                // String, Double, Bytes, Uint16, Uint32 - no pointers
435            }
436            7 => {
437                // Map - validate all values
438                let count = decode_size_for_validation(data_section, &mut cursor, payload)?;
439                for _ in 0..count {
440                    // Skip key
441                    cursor = skip_data_value(data_section, cursor)?;
442                    // Validate value
443                    let child_depth =
444                        validate_data_value_pointers(data_section, cursor, path, depth + 1)?;
445                    max_child_depth = max_child_depth.max(child_depth);
446                    cursor = skip_data_value(data_section, cursor)?;
447                }
448            }
449            _ => {
450                return Err(PointerValidationError::InvalidType { offset, type_id });
451            }
452        }
453        Ok(max_child_depth)
454    })();
455
456    // Remove from path when backtracking (regardless of success/failure)
457    path.remove(&offset);
458
459    result
460}
461
462/// Decode size field for validation
463fn decode_size_for_validation(
464    data: &[u8],
465    cursor: &mut usize,
466    size_bits: u8,
467) -> Result<usize, PointerValidationError> {
468    match size_bits {
469        0..=28 => Ok(size_bits as usize),
470        29 => {
471            if *cursor >= data.len() {
472                return Err(PointerValidationError::InvalidOffset {
473                    offset: *cursor,
474                    reason: "Size byte out of bounds".to_string(),
475                });
476            }
477            let size = data[*cursor] as usize;
478            *cursor += 1;
479            Ok(29 + size)
480        }
481        30 => {
482            if *cursor + 2 > data.len() {
483                return Err(PointerValidationError::InvalidOffset {
484                    offset: *cursor,
485                    reason: "Size bytes out of bounds".to_string(),
486                });
487            }
488            let size = u16::from_be_bytes([data[*cursor], data[*cursor + 1]]) as usize;
489            *cursor += 2;
490            Ok(29 + 256 + size)
491        }
492        31 => {
493            if *cursor + 3 > data.len() {
494                return Err(PointerValidationError::InvalidOffset {
495                    offset: *cursor,
496                    reason: "Size bytes out of bounds".to_string(),
497                });
498            }
499            let b0 = data[*cursor] as usize;
500            let b1 = data[*cursor + 1] as usize;
501            let b2 = data[*cursor + 2] as usize;
502            *cursor += 3;
503            Ok(29 + 256 + 65536 + ((b0 << 16) | (b1 << 8) | b2))
504        }
505        _ => Err(PointerValidationError::InvalidOffset {
506            offset: *cursor,
507            reason: "Invalid size encoding".to_string(),
508        }),
509    }
510}
511
512/// Decode pointer offset for validation
513fn decode_pointer_offset(
514    data: &[u8],
515    cursor: &mut usize,
516    payload: u8,
517) -> Result<usize, PointerValidationError> {
518    let size_bits = (payload >> 3) & 0x3;
519
520    let offset = match size_bits {
521        0 => {
522            if *cursor >= data.len() {
523                return Err(PointerValidationError::InvalidOffset {
524                    offset: *cursor,
525                    reason: "Pointer data truncated".to_string(),
526                });
527            }
528            let low_3_bits = (payload & 0x7) as usize;
529            let next_byte = data[*cursor] as usize;
530            *cursor += 1;
531            (low_3_bits << 8) | next_byte
532        }
533        1 => {
534            if *cursor + 1 >= data.len() {
535                return Err(PointerValidationError::InvalidOffset {
536                    offset: *cursor,
537                    reason: "Pointer data truncated".to_string(),
538                });
539            }
540            let low_3_bits = (payload & 0x7) as usize;
541            let b0 = data[*cursor] as usize;
542            let b1 = data[*cursor + 1] as usize;
543            *cursor += 2;
544            2048 + ((low_3_bits << 16) | (b0 << 8) | b1)
545        }
546        2 => {
547            if *cursor + 2 >= data.len() {
548                return Err(PointerValidationError::InvalidOffset {
549                    offset: *cursor,
550                    reason: "Pointer data truncated".to_string(),
551                });
552            }
553            let low_3_bits = (payload & 0x7) as usize;
554            let b0 = data[*cursor] as usize;
555            let b1 = data[*cursor + 1] as usize;
556            let b2 = data[*cursor + 2] as usize;
557            *cursor += 3;
558            526336 + ((low_3_bits << 24) | (b0 << 16) | (b1 << 8) | b2)
559        }
560        3 => {
561            if *cursor + 3 >= data.len() {
562                return Err(PointerValidationError::InvalidOffset {
563                    offset: *cursor,
564                    reason: "Pointer data truncated".to_string(),
565                });
566            }
567            let b0 = data[*cursor] as usize;
568            let b1 = data[*cursor + 1] as usize;
569            let b2 = data[*cursor + 2] as usize;
570            let b3 = data[*cursor + 3] as usize;
571            *cursor += 4;
572            (b0 << 24) | (b1 << 16) | (b2 << 8) | b3
573        }
574        _ => {
575            return Err(PointerValidationError::InvalidOffset {
576                offset: *cursor,
577                reason: "Invalid pointer size bits".to_string(),
578            });
579        }
580    };
581
582    Ok(offset)
583}
584
585/// Skip past a data value (returns offset after the value)
586fn skip_data_value(data: &[u8], offset: usize) -> Result<usize, PointerValidationError> {
587    if offset >= data.len() {
588        return Err(PointerValidationError::InvalidOffset {
589            offset,
590            reason: "Offset beyond data".to_string(),
591        });
592    }
593
594    let ctrl = data[offset];
595    let type_id = ctrl >> 5;
596    let payload = ctrl & 0x1F;
597    let mut cursor = offset + 1;
598
599    match type_id {
600        0 => {
601            // Extended type
602            if cursor >= data.len() {
603                return Err(PointerValidationError::InvalidOffset {
604                    offset,
605                    reason: "Extended type truncated".to_string(),
606                });
607            }
608            cursor += 1; // Skip extended type byte
609            let size = decode_size_for_validation(data, &mut cursor, payload)?;
610            Ok(cursor + size)
611        }
612        1 => {
613            // Pointer
614            let size_bits = (payload >> 3) & 0x3;
615            let ptr_size = match size_bits {
616                0 => 1,
617                1 => 2,
618                2 => 3,
619                3 => 4,
620                _ => 0,
621            };
622            Ok(cursor + ptr_size)
623        }
624        2 | 4 => {
625            // String or Bytes
626            let size = decode_size_for_validation(data, &mut cursor, payload)?;
627            Ok(cursor + size)
628        }
629        3 => Ok(cursor + 8), // Double
630        5 => {
631            // Uint16
632            let size = decode_size_for_validation(data, &mut cursor, payload)?;
633            Ok(cursor + size.min(2))
634        }
635        6 => {
636            // Uint32
637            let size = decode_size_for_validation(data, &mut cursor, payload)?;
638            Ok(cursor + size.min(4))
639        }
640        7 => {
641            // Map
642            let count = decode_size_for_validation(data, &mut cursor, payload)?;
643            for _ in 0..count {
644                cursor = skip_data_value(data, cursor)?; // Skip key
645                cursor = skip_data_value(data, cursor)?; // Skip value
646            }
647            Ok(cursor)
648        }
649        _ => Err(PointerValidationError::InvalidType { offset, type_id }),
650    }
651}
652
#[cfg(test)]
mod tests {
    use super::*;
    use crate::DataEncoder;
    use std::collections::HashMap;

    /// A lone string counts as exactly one validated string.
    #[test]
    fn test_validate_simple_string() {
        let mut enc = DataEncoder::new();
        let at = enc.encode(&DataValue::String(String::from("test")));
        let bytes = enc.into_bytes();

        let found = validate_data_value_utf8(&bytes, at as usize, 0).unwrap();
        assert_eq!(found, 1);
    }

    /// Map keys and string values are both counted.
    #[test]
    fn test_validate_map_with_strings() {
        let entries = HashMap::from([
            (String::from("key1"), DataValue::String(String::from("value1"))),
            (String::from("key2"), DataValue::String(String::from("value2"))),
            (String::from("num"), DataValue::Uint32(42)),
        ]);

        let mut enc = DataEncoder::new();
        let at = enc.encode(&DataValue::Map(entries));
        let bytes = enc.into_bytes();

        let found = validate_data_value_utf8(&bytes, at as usize, 0).unwrap();
        // 3 keys + 2 string values = 5 strings total
        // (Note: string interning may create pointers, but those are resolved during decode)
        assert_eq!(found, 5);
    }

    /// Strings are counted at every nesting level.
    #[test]
    fn test_validate_nested_structure() {
        let inner = HashMap::from([(
            String::from("inner"),
            DataValue::String(String::from("nested")),
        )]);
        let outer = HashMap::from([
            (String::from("outer"), DataValue::String(String::from("top"))),
            (String::from("nested"), DataValue::Map(inner)),
        ]);

        let mut enc = DataEncoder::new();
        let at = enc.encode(&DataValue::Map(outer));
        let bytes = enc.into_bytes();

        let found = validate_data_value_utf8(&bytes, at as usize, 0).unwrap();
        // Outer: 2 keys + 1 string value = 3
        // Inner: 1 key + 1 string value = 2
        // Total = 5 strings
        assert_eq!(found, 5);
    }

    /// Non-string array elements do not contribute to the count.
    #[test]
    fn test_validate_array_with_strings() {
        let items = vec![
            DataValue::String(String::from("a")),
            DataValue::String(String::from("b")),
            DataValue::Uint32(123),
        ];

        let mut enc = DataEncoder::new();
        let at = enc.encode(&DataValue::Array(items));
        let bytes = enc.into_bytes();

        let found = validate_data_value_utf8(&bytes, at as usize, 0).unwrap();
        assert_eq!(found, 2); // 2 strings in array
    }

    /// Two independently encoded strings are both decoded and counted.
    #[test]
    fn test_validate_data_section() {
        let mut enc = DataEncoder::new();
        let first_at = enc.encode(&DataValue::String(String::from("first")));
        let second_at = enc.encode(&DataValue::String(String::from("second")));
        let bytes = enc.into_bytes();

        let report = validate_data_section(&bytes, 0, &[first_at, second_at]);
        assert!(report.is_valid());
        assert_eq!(report.stats.values_checked, 2);
        assert_eq!(report.stats.strings_checked, 2);
    }

    /// An offset outside the section is reported as an error.
    #[test]
    fn test_validate_invalid_offset() {
        // Encode real data first so the empty-section warning path is not taken
        let mut enc = DataEncoder::new();
        enc.encode(&DataValue::String(String::from("test")));
        let bytes = enc.into_bytes();

        // Now try to validate an invalid offset
        let report = validate_data_section(&bytes, 0, &[999]);
        assert!(!report.is_valid());
        assert!(!report.errors.is_empty());
    }

    /// An empty section produces a single warning and no errors.
    #[test]
    fn test_validate_empty_data_section() {
        let bytes: Vec<u8> = Vec::new();
        let report = validate_data_section(&bytes, 0, &[]);
        // Empty is not an error, just a warning
        assert!(report.is_valid());
        assert_eq!(report.warnings.len(), 1);
    }
}
764}