matchy_format/mmdb/
format.rs

1//! MMDB Binary Format Parsing
2//!
3//! This module handles parsing the MMDB binary format with minimal heap allocation.
4//! Only essential header information is extracted; everything else stays in mmap.
5//!
6//! Design:
7//! - Find metadata marker (slice search, no allocation)
8//! - Extract only: node_count, record_size, ip_version (~16 bytes on heap)
9//! - Tree traversal works with pure offsets (zero allocation)
10//! - Data decoding only allocates when returning results to users
11
12use super::types::{record_size_from_bits, IpVersion, MmdbError, RecordSize, METADATA_MARKER};
13use matchy_data_format::{DataDecoder, DataValue};
14
15/// MMDB file header - minimal heap usage
16///
17/// Contains only the essential information needed for IP lookups.
18/// Total heap usage: ~16 bytes.
19#[derive(Debug, Clone, Copy)]
20pub struct MmdbHeader {
21    /// Number of nodes in the search tree
22    pub node_count: u32,
23    /// Record size in bits (24, 28, or 32)
24    pub record_size: RecordSize,
25    /// IP version (4 or 6)
26    pub ip_version: IpVersion,
27    /// Size of the search tree in bytes
28    pub tree_size: usize,
29}
30
31impl MmdbHeader {
32    /// Parse MMDB file and extract minimal header information
33    ///
34    /// Only extracts fields needed for IP lookups. Metadata stays in mmap.
35    pub fn from_file(data: &[u8]) -> Result<Self, MmdbError> {
36        // Find metadata marker
37        let marker_offset = find_metadata_marker(data)?;
38
39        // Metadata comes AFTER the marker (verified from libmaxminddb source)
40        // The metadata section starts right after the marker bytes
41        let metadata_offset = marker_offset + METADATA_MARKER.len();
42        let metadata_bytes = &data[metadata_offset..];
43
44        // Decode metadata as MMDB data starting at offset 0
45        let decoder = DataDecoder::new(metadata_bytes, 0);
46        let metadata_value = decoder
47            .decode(0)
48            .map_err(|e| MmdbError::InvalidMetadata(format!("Failed to decode metadata: {e}")))?;
49
50        // Extract required fields (temporary allocation during parsing)
51        let (node_count, record_size_bits, ip_version_num) = match metadata_value {
52            DataValue::Map(ref map) => {
53                let node_count = extract_uint(map, "node_count")?;
54                let record_size = u16::try_from(extract_uint(map, "record_size")?)
55                    .map_err(|_| MmdbError::InvalidMetadata("record_size too large".to_string()))?;
56                let ip_version = extract_uint(map, "ip_version")?;
57                (node_count, record_size, ip_version)
58            }
59            _ => {
60                return Err(MmdbError::InvalidMetadata(
61                    "Metadata is not a map".to_string(),
62                ))
63            }
64        };
65
66        let record_size = record_size_from_bits(record_size_bits)?;
67
68        let ip_version = match ip_version_num {
69            4 => IpVersion::V4,
70            6 => IpVersion::V6,
71            _ => {
72                return Err(MmdbError::InvalidMetadata(format!(
73                    "Invalid IP version: {ip_version_num}"
74                )))
75            }
76        };
77
78        // Calculate tree size
79        let node_count_u32 = u32::try_from(node_count)
80            .map_err(|_| MmdbError::InvalidMetadata("node_count exceeds u32::MAX".to_string()))?;
81        let tree_size = usize::try_from(node_count)
82            .map_err(|_| MmdbError::InvalidMetadata("node_count exceeds usize".to_string()))?
83            * record_size.node_bytes();
84
85        Ok(Self {
86            node_count: node_count_u32,
87            record_size,
88            ip_version,
89            tree_size,
90        })
91    }
92}
93
/// Lazy accessor for the full metadata section.
///
/// Holds a borrow of the file bytes together with the offset at which the
/// metadata begins. Nothing is decoded until `as_value` is called, so
/// non-essential metadata fields cost no allocation unless requested.
pub struct MmdbMetadata<'a> {
    /// The complete file contents this accessor borrows from
    raw_data: &'a [u8],
    /// Offset of the first metadata byte (directly after the marker)
    metadata_offset: usize,
}
102
103impl<'a> MmdbMetadata<'a> {
104    /// Create metadata accessor from mmap'd data
105    pub fn from_file(data: &'a [u8]) -> Result<Self, MmdbError> {
106        let metadata_start = find_metadata_marker(data)?;
107        let metadata_offset = metadata_start + METADATA_MARKER.len();
108
109        Ok(MmdbMetadata {
110            raw_data: data,
111            metadata_offset,
112        })
113    }
114
115    /// Get full metadata as DataValue (allocates on-demand)
116    pub fn as_value(&self) -> Result<DataValue, MmdbError> {
117        let decoder = DataDecoder::new(&self.raw_data[self.metadata_offset..], 0);
118        decoder
119            .decode(0)
120            .map_err(|e| MmdbError::InvalidMetadata(e.to_string()))
121    }
122}
123
124/// Find the metadata marker in MMDB file (zero allocation)
125///
126/// The marker "\xAB\xCD\xEFMaxMind.com" appears somewhere in the last 128KB
127/// of the file. The metadata comes AFTER the marker.
128///
129/// Note: If there are multiple markers (unlikely but possible), we want the LAST one.
130pub fn find_metadata_marker(data: &[u8]) -> Result<usize, MmdbError> {
131    const SEARCH_SIZE: usize = 128 * 1024; // 128KB
132
133    if data.len() < METADATA_MARKER.len() {
134        return Err(MmdbError::MetadataNotFound);
135    }
136
137    // Start searching from the end, but only within the last 128KB
138    let search_start = if data.len() > SEARCH_SIZE {
139        data.len() - SEARCH_SIZE
140    } else {
141        0
142    };
143
144    // Search for the marker, keeping track of the LAST occurrence
145    // (libmaxminddb does this to handle files with multiple markers)
146    let mut last_marker = None;
147    for i in search_start..=(data.len() - METADATA_MARKER.len()) {
148        if &data[i..i + METADATA_MARKER.len()] == METADATA_MARKER {
149            last_marker = Some(i);
150        }
151    }
152
153    last_marker.ok_or(MmdbError::MetadataNotFound)
154}
155
156// Helper functions to extract values from metadata map (temporary during parsing)
157
158fn extract_uint(
159    map: &std::collections::HashMap<String, DataValue>,
160    key: &str,
161) -> Result<u64, MmdbError> {
162    match map.get(key) {
163        Some(DataValue::Uint16(n)) => Ok(u64::from(*n)),
164        Some(DataValue::Uint32(n)) => Ok(u64::from(*n)),
165        Some(DataValue::Uint64(n)) => Ok(*n),
166        Some(_) => Err(MmdbError::InvalidMetadata(format!(
167            "Field '{key}' is not an unsigned integer"
168        ))),
169        None => Err(MmdbError::InvalidMetadata(format!(
170            "Required field '{key}' not found"
171        ))),
172    }
173}
174
#[cfg(test)]
mod tests {
    use super::*;

    /// The marker must be found in the bundled GeoLite2-Country fixture, and
    /// the bytes at the reported offset must equal the marker itself.
    #[test]
    fn test_find_metadata_marker() {
        let data = include_bytes!("../../tests/data/GeoLite2-Country.mmdb");
        let marker_offset = find_metadata_marker(data);
        assert!(marker_offset.is_ok(), "Should find metadata marker");

        let offset = marker_offset.unwrap();
        println!("Total file size: {} bytes", data.len());
        println!("Marker found at offset: {offset}");
        println!(
            "Marker: {:?}",
            &data[offset..offset + METADATA_MARKER.len()]
        );

        assert!(offset > 0, "Marker should not be at start of file");
        assert_eq!(
            &data[offset..offset + METADATA_MARKER.len()],
            METADATA_MARKER
        );

        // Check what's around the marker
        let after_marker = offset + METADATA_MARKER.len();
        let before_marker = offset.saturating_sub(20);
        println!(
            "20 bytes before marker: {:02x?}",
            &data[before_marker..offset]
        );
        println!(
            "Bytes after marker: {} bytes remaining",
            data.len() - after_marker
        );
        if data.len() > after_marker {
            // Show up to 20 bytes, clamped to the end of the file so a short
            // metadata section cannot cause an out-of-bounds slice.
            let preview_end = (after_marker + 20).min(data.len());
            println!(
                "First 20 bytes after marker: {:02x?}",
                &data[after_marker..preview_end]
            );
        }
    }

    /// Parsing the fixture must yield a header with a non-empty search tree.
    #[test]
    fn test_parse_header_minimal() {
        let data = include_bytes!("../../tests/data/GeoLite2-Country.mmdb");
        let header = MmdbHeader::from_file(data);
        if let Err(ref e) = header {
            println!("Error parsing header: {e}");
        }
        assert!(header.is_ok(), "Should parse header successfully");

        let header = header.unwrap();
        assert!(header.node_count > 0, "Should have nodes");
        assert!(header.tree_size > 0, "Tree should have size");

        // Record size should be valid
        match header.record_size {
            RecordSize::Bits24 | RecordSize::Bits28 | RecordSize::Bits32 => {}
        }

        // IP version should be valid
        match header.ip_version {
            IpVersion::V4 | IpVersion::V6 => {}
        }

        println!("Header: {header:?}");
        println!("Heap usage: ~{} bytes", std::mem::size_of_val(&header));
    }

    /// The on-demand accessor must decode the fixture's metadata into a map
    /// whose `database_type` and `build_epoch` entries, when present, are sane.
    #[test]
    fn test_metadata_on_demand() {
        let data = include_bytes!("../../tests/data/GeoLite2-Country.mmdb");
        let metadata = MmdbMetadata::from_file(data);
        assert!(metadata.is_ok(), "Should create metadata accessor");

        let metadata = metadata.unwrap();

        // Parse on-demand from mmap using as_value()
        let metadata_value = metadata.as_value();
        assert!(metadata_value.is_ok());

        if let DataValue::Map(ref map) = metadata_value.unwrap() {
            // Check database_type
            if let Some(DataValue::String(db_type)) = map.get("database_type") {
                assert_eq!(db_type, "GeoLite2-Country");
            }

            // Check build_epoch
            if let Some(epoch_value) = map.get("build_epoch") {
                let epoch_num = match epoch_value {
                    DataValue::Uint32(n) => u64::from(*n),
                    DataValue::Uint64(n) => *n,
                    _ => panic!("build_epoch has unexpected type"),
                };
                println!("Build epoch: {epoch_num}");
                assert!(epoch_num > 0);
            }
        } else {
            panic!("Metadata should be a map");
        }
    }

    /// Input without a marker must be rejected with `MetadataNotFound`.
    #[test]
    fn test_metadata_not_found() {
        let data = b"not a valid mmdb file";
        let result = find_metadata_marker(data);
        assert!(result.is_err());
        assert!(matches!(result, Err(MmdbError::MetadataNotFound)));
    }
}