matchy_format/
mmdb_builder.rs

1//! Unified MMDB Database Builder
2//!
3//! Builds MMDB-format databases containing both IP address data and pattern matching data.
4//! Automatically detects whether input rows are IP addresses (including CIDRs) or patterns.
5
6use crate::error::FormatError;
7use crate::mmdb::types::RecordSize;
8use crate::validation::EntryValidator;
9use matchy_data_format::{DataEncoder, DataValue};
10use matchy_ip_trie::IpTreeBuilder;
11use matchy_literal_hash::LiteralHashBuilder;
12use matchy_match_mode::MatchMode;
13use matchy_paraglob::ParaglobBuilder;
14use rustc_hash::FxHasher;
15use std::collections::HashMap;
16use std::hash::{Hash, Hasher};
17use std::net::IpAddr;
18
/// Entry type classification
///
/// Decided once, when a key is added to [`DatabaseBuilder`], either by
/// [`DatabaseBuilder::detect_entry_type`] or by one of the explicit
/// `add_ip` / `add_literal` / `add_glob` methods. `DatabaseBuilder::build`
/// routes each variant to a different section of the output file.
#[derive(Debug, Clone)]
pub enum EntryType {
    /// IP address or CIDR block with prefix length
    IpAddress {
        /// IP address
        addr: IpAddr,
        /// Prefix length (0-32 for IPv4, 0-128 for IPv6).
        /// A bare address without `/n` is stored with the maximum length (32 or 128).
        prefix_len: u8,
    },
    /// Literal string (exact match, goes in the literal hash table section)
    Literal(String),
    /// Glob pattern (wildcard match, goes in the Paraglob pattern section)
    Glob(String),
}
34
/// Lightweight entry reference (just entry type + offset, no data)
///
/// The entry's data map is encoded when the entry is added; only the offset
/// handed back by the data encoder is kept here.
#[derive(Debug, Clone)]
struct EntryRef {
    /// Classification of the key; the literal and glob variants own the key text.
    entry_type: EntryType,
    /// Offset returned by the data encoder for this entry's data map.
    /// Entries whose data maps hash identically share the same offset.
    data_offset: u32,
}
41
/// Unified database builder
///
/// Collects IP/CIDR, literal, and glob entries (each with an attached data
/// map) and serializes them into a single byte buffer via `build`.
pub struct DatabaseBuilder {
    /// Every entry added so far, in insertion order.
    entries: Vec<EntryRef>,
    /// Encoder that accumulates the data section as entries are added.
    data_encoder: DataEncoder,
    /// Deduplication cache: hash of an entry's data map -> offset of its encoding.
    data_cache: HashMap<u64, u32>,
    /// Case sensitivity handed to the glob and literal section builders.
    match_mode: MatchMode,
    /// Custom `database_type` metadata value; `None` lets `build` choose one.
    database_type: Option<String>,
    /// Custom descriptions keyed by language code; empty means use the default.
    description: HashMap<String, String>,
    /// Optional validator run against every entry before it is stored.
    validator: Option<Box<dyn EntryValidator>>,
    /// Optional `update_url` metadata value.
    update_url: Option<String>,
}
53
54impl DatabaseBuilder {
55    #[must_use]
56    pub fn new(match_mode: MatchMode) -> Self {
57        Self {
58            entries: Vec::new(),
59            data_encoder: DataEncoder::new(),
60            data_cache: HashMap::new(),
61            match_mode,
62            database_type: None,
63            description: HashMap::new(),
64            validator: None,
65            update_url: None,
66        }
67    }
68
69    /// Set a custom database type name
70    ///
71    /// If not set, defaults to "Paraglob-Combined-IP-Pattern" or "Paraglob-IP"
72    ///
73    /// # Example
74    /// ```
75    /// use matchy_format::DatabaseBuilder;
76    /// use matchy_match_mode::MatchMode;
77    ///
78    /// let builder = DatabaseBuilder::new(MatchMode::CaseSensitive)
79    ///     .with_database_type("MyCompany-ThreatIntel");
80    /// ```
81    #[must_use]
82    pub fn with_database_type(mut self, db_type: impl Into<String>) -> Self {
83        self.database_type = Some(db_type.into());
84        self
85    }
86
87    /// Add a description in a specific language
88    ///
89    /// Can be called multiple times for different languages.
90    /// If not called, defaults to English description.
91    ///
92    /// # Example
93    /// ```
94    /// use matchy_format::DatabaseBuilder;
95    /// use matchy_match_mode::MatchMode;
96    ///
97    /// let builder = DatabaseBuilder::new(MatchMode::CaseSensitive)
98    ///     .with_description("en", "My custom threat database")
99    ///     .with_description("es", "Mi base de datos de amenazas personalizada");
100    /// ```
101    #[must_use]
102    pub fn with_description(
103        mut self,
104        language: impl Into<String>,
105        text: impl Into<String>,
106    ) -> Self {
107        self.description.insert(language.into(), text.into());
108        self
109    }
110
111    /// Set an entry validator for schema validation
112    #[must_use]
113    pub fn with_validator(mut self, validator: Box<dyn EntryValidator>) -> Self {
114        self.validator = Some(validator);
115        self
116    }
117
118    /// Set the URL where updates to this database can be downloaded
119    ///
120    /// When set, this URL is stored in the database metadata. Applications can use
121    /// `Database::update_url()` to retrieve it and implement auto-update functionality.
122    ///
123    /// # Example
124    /// ```
125    /// use matchy_format::DatabaseBuilder;
126    /// use matchy_match_mode::MatchMode;
127    ///
128    /// let builder = DatabaseBuilder::new(MatchMode::CaseSensitive)
129    ///     .with_update_url("https://example.com/threats.mxy");
130    /// ```
131    #[must_use]
132    pub fn with_update_url(mut self, url: impl Into<String>) -> Self {
133        self.update_url = Some(url.into());
134        self
135    }
136
137    /// Set the match mode for pattern matching
138    ///
139    /// This controls whether literal and glob pattern matching is case-sensitive
140    /// or case-insensitive. IP address matching is always case-insensitive regardless
141    /// of this setting.
142    ///
143    /// # Example
144    /// ```
145    /// use matchy_format::DatabaseBuilder;
146    /// use matchy_match_mode::MatchMode;
147    ///
148    /// let builder = DatabaseBuilder::new(MatchMode::CaseSensitive)
149    ///     .with_match_mode(MatchMode::CaseInsensitive);
150    /// ```
151    #[must_use]
152    pub fn with_match_mode(mut self, match_mode: MatchMode) -> Self {
153        self.match_mode = match_mode;
154        self
155    }
156
157    /// Set the match mode (mutable borrow version)
158    ///
159    /// This is useful when you need to change the match mode after construction
160    /// without consuming the builder.
161    pub fn set_match_mode(&mut self, match_mode: MatchMode) {
162        self.match_mode = match_mode;
163    }
164
165    /// Validate entry data if a validator is configured
166    fn validate_entry(
167        &self,
168        key: &str,
169        data: &HashMap<String, DataValue>,
170    ) -> Result<(), FormatError> {
171        if let Some(ref validator) = self.validator {
172            validator
173                .validate(key, data)
174                .map_err(|e| FormatError::ValidationError(format!("{e}")))?;
175        }
176        Ok(())
177    }
178
179    /// Add an entry with auto-detection
180    ///
181    /// Automatically detects whether the key is an IP address, literal string, or glob pattern.
182    /// For explicit control, use `add_ip()`, `add_literal()`, or `add_glob()`.
183    ///
184    /// If a validator is configured via [`with_validator`](Self::with_validator), the entry
185    /// data will be validated before insertion. Returns an error if validation fails.
186    pub fn add_entry(
187        &mut self,
188        key: &str,
189        data: HashMap<String, DataValue>,
190    ) -> Result<(), FormatError> {
191        self.validate_entry(key, &data)?;
192        let entry_type = Self::detect_entry_type(key)?;
193        let data_offset = self.encode_and_deduplicate_data(data);
194
195        self.entries.push(EntryRef {
196            entry_type,
197            data_offset,
198        });
199
200        Ok(())
201    }
202
203    /// Add a literal string pattern (exact match only, no wildcards)
204    ///
205    /// Use this when the string contains characters like '*', '?', or '[' that should be
206    /// matched literally rather than as glob wildcards.
207    ///
208    /// If a validator is configured via [`with_validator`](Self::with_validator), the entry
209    /// data will be validated before insertion. Returns an error if validation fails.
210    ///
211    /// # Example
212    /// ```
213    /// # use matchy_format::DatabaseBuilder;
214    /// # use matchy_match_mode::MatchMode;
215    /// # use matchy_data_format::DataValue;
216    /// # use std::collections::HashMap;
217    /// let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);
218    /// let mut data = HashMap::new();
219    /// data.insert("note".to_string(), DataValue::String("literal".to_string()));
220    ///
221    /// // This has '[' but we want to match it literally
222    /// builder.add_literal("file[1].txt", data)?;
223    /// # Ok::<(), Box<dyn std::error::Error>>(())
224    /// ```
225    pub fn add_literal(
226        &mut self,
227        pattern: &str,
228        data: HashMap<String, DataValue>,
229    ) -> Result<(), FormatError> {
230        self.validate_entry(pattern, &data)?;
231        let data_offset = self.encode_and_deduplicate_data(data);
232        self.entries.push(EntryRef {
233            entry_type: EntryType::Literal(pattern.to_string()),
234            data_offset,
235        });
236        Ok(())
237    }
238
239    /// Add a glob pattern (with wildcard matching)
240    ///
241    /// Use this to explicitly mark a pattern for glob matching, even if it doesn't
242    /// contain obvious wildcard characters.
243    ///
244    /// If a validator is configured via [`with_validator`](Self::with_validator), the entry
245    /// data will be validated before insertion. Returns an error if validation fails.
246    ///
247    /// # Example
248    /// ```
249    /// # use matchy_format::DatabaseBuilder;
250    /// # use matchy_match_mode::MatchMode;
251    /// # use matchy_data_format::DataValue;
252    /// # use std::collections::HashMap;
253    /// let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);
254    /// let mut data = HashMap::new();
255    /// data.insert("category".to_string(), DataValue::String("malware".to_string()));
256    ///
257    /// builder.add_glob("*.evil.com", data)?;
258    /// # Ok::<(), Box<dyn std::error::Error>>(())
259    /// ```
260    pub fn add_glob(
261        &mut self,
262        pattern: &str,
263        data: HashMap<String, DataValue>,
264    ) -> Result<(), FormatError> {
265        self.validate_entry(pattern, &data)?;
266        let data_offset = self.encode_and_deduplicate_data(data);
267        self.entries.push(EntryRef {
268            entry_type: EntryType::Glob(pattern.to_string()),
269            data_offset,
270        });
271        Ok(())
272    }
273
274    /// Encode data and deduplicate to save memory
275    fn encode_and_deduplicate_data(&mut self, data: HashMap<String, DataValue>) -> u32 {
276        // Fast hash computation without string allocation
277        let data_value = DataValue::Map(data);
278        let mut hasher = FxHasher::default();
279        data_value.hash(&mut hasher);
280        let hash = hasher.finish();
281
282        // Check cache
283        if let Some(&offset) = self.data_cache.get(&hash) {
284            return offset;
285        }
286
287        // Encode and cache
288        let offset = self.data_encoder.encode(&data_value);
289        self.data_cache.insert(hash, offset);
290        offset
291    }
292
293    /// Add an IP address or CIDR block
294    ///
295    /// Use this to explicitly mark an entry as an IP address. Will return an error
296    /// if the string is not a valid IP address or CIDR notation.
297    ///
298    /// If a validator is configured via [`with_validator`](Self::with_validator), the entry
299    /// data will be validated before insertion. Returns an error if validation fails.
300    ///
301    /// # Arguments
302    /// * `ip_or_cidr` - IP address or CIDR range (e.g., "192.168.1.0/24")
303    /// * `data` - HashMap of key-value pairs to associate with the IP
304    ///
305    /// # Errors
306    /// Returns an error if the IP address or CIDR format is invalid, or if validation fails.
307    ///
308    /// # Example
309    /// ```
310    /// # use matchy_format::DatabaseBuilder;
311    /// # use matchy_match_mode::MatchMode;
312    /// # use matchy_data_format::DataValue;
313    /// # use std::collections::HashMap;
314    /// let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);
315    /// let mut data = HashMap::new();
316    /// data.insert("country".to_string(), DataValue::String("US".to_string()));
317    ///
318    /// builder.add_ip("192.168.1.0/24", data)?;
319    /// # Ok::<(), Box<dyn std::error::Error>>(())
320    /// ```
321    pub fn add_ip(
322        &mut self,
323        ip_or_cidr: &str,
324        data: HashMap<String, DataValue>,
325    ) -> Result<(), FormatError> {
326        self.validate_entry(ip_or_cidr, &data)?;
327        let entry_type = Self::parse_ip_entry(ip_or_cidr)?;
328        let data_offset = self.encode_and_deduplicate_data(data);
329
330        self.entries.push(EntryRef {
331            entry_type,
332            data_offset,
333        });
334        Ok(())
335    }
336
337    /// Parse IP address or CIDR (used by add_ip)
338    fn parse_ip_entry(key: &str) -> Result<EntryType, FormatError> {
339        // Try parsing as plain IP address first
340        if let Ok(addr) = key.parse::<IpAddr>() {
341            let prefix_len = if addr.is_ipv4() { 32 } else { 128 };
342            return Ok(EntryType::IpAddress { addr, prefix_len });
343        }
344
345        // Check for CIDR notation
346        if let Some(slash_pos) = key.find('/') {
347            let addr_str = &key[..slash_pos];
348            let prefix_str = &key[slash_pos + 1..];
349
350            if let (Ok(addr), Ok(prefix_len)) =
351                (addr_str.parse::<IpAddr>(), prefix_str.parse::<u8>())
352            {
353                // Validate prefix length
354                let max_prefix = if addr.is_ipv4() { 32 } else { 128 };
355                if prefix_len <= max_prefix {
356                    return Ok(EntryType::IpAddress { addr, prefix_len });
357                }
358            }
359        }
360
361        Err(FormatError::InvalidPattern(format!(
362            "Invalid IP address or CIDR: {key}"
363        )))
364    }
365
366    /// Auto-detect if key is an IP/CIDR, literal, or glob pattern
367    ///
368    /// Supports explicit type prefixes for disambiguation:
369    /// - `literal:` - Force literal string matching (strips prefix)
370    /// - `glob:` - Force glob pattern matching (strips prefix)
371    /// - `ip:` - Force IP address parsing (strips prefix)
372    ///
373    /// Without a prefix, auto-detection is used:
374    /// 1. Try parsing as IP address/CIDR
375    /// 2. If contains glob chars (*, ?, [), validate as glob pattern
376    /// 3. Otherwise treat as literal string
377    ///
378    /// # Examples
379    /// ```
380    /// # use matchy_format::{DatabaseBuilder, EntryType};
381    /// # use matchy_match_mode::MatchMode;
382    /// // Auto-detection
383    /// assert!(matches!(DatabaseBuilder::detect_entry_type("1.2.3.4"), Ok(EntryType::IpAddress { .. })));
384    /// assert!(matches!(DatabaseBuilder::detect_entry_type("*.example.com"), Ok(EntryType::Glob(_))));
385    /// assert!(matches!(DatabaseBuilder::detect_entry_type("evil.com"), Ok(EntryType::Literal(_))));
386    ///
387    /// // Explicit type control
388    /// assert!(matches!(DatabaseBuilder::detect_entry_type("literal:*.not-a-glob.com"), Ok(EntryType::Literal(_))));
389    /// assert!(matches!(DatabaseBuilder::detect_entry_type("glob:no-wildcards.com"), Ok(EntryType::Glob(_))));
390    /// ```
391    pub fn detect_entry_type(key: &str) -> Result<EntryType, FormatError> {
392        // Check for explicit type prefixes first
393        if let Some(stripped) = key.strip_prefix("literal:") {
394            // Force literal matching - strip prefix and treat as literal
395            return Ok(EntryType::Literal(stripped.to_string()));
396        }
397
398        if let Some(stripped) = key.strip_prefix("glob:") {
399            // Force glob matching - strip prefix and validate as glob
400            matchy_paraglob::validate_glob_pattern(stripped).map_err(|e| {
401                FormatError::InvalidPattern(format!("Invalid glob pattern syntax: {e}"))
402            })?;
403            return Ok(EntryType::Glob(stripped.to_string()));
404        }
405
406        if let Some(stripped) = key.strip_prefix("ip:") {
407            // Force IP parsing - strip prefix and parse as IP
408            return Self::parse_ip_entry(stripped);
409        }
410
411        // No prefix - use auto-detection
412        // Try parsing as IP address first (most specific)
413        if Self::parse_ip_entry(key).is_ok() {
414            return Self::parse_ip_entry(key);
415        }
416
417        // Check for glob pattern characters - validate they form a valid glob
418        if key.contains('*') || key.contains('?') || key.contains('[') {
419            // Validate the glob syntax
420            if matchy_paraglob::validate_glob_pattern(key).is_ok() {
421                return Ok(EntryType::Glob(key.to_string()));
422            }
423            // If it contains glob-like chars but isn't a valid glob, treat as literal
424        }
425
426        // Otherwise, treat as literal string
427        Ok(EntryType::Literal(key.to_string()))
428    }
429
    /// Build the unified MMDB database
    ///
    /// Consumes the builder and returns the complete database as one byte
    /// buffer. The sections are written in this order:
    ///
    /// 1. IP tree (always present, possibly empty)
    /// 2. 16 zero bytes (separator)
    /// 3. Data section (the encoded data maps of all entries)
    /// 4. Zero padding so the glob section starts 4-byte aligned (only when globs exist)
    /// 5. `MMDB_PATTERN` + 4 zero bytes, then the glob section (only when globs exist)
    /// 6. `MMDB_LITERAL` + 4 zero bytes, then the literal section (only when literals exist)
    /// 7. `\xAB\xCD\xEFMaxMind.com` marker, then the encoded metadata map
    ///
    /// # Errors
    /// Propagates any error returned by the IP tree builder (insert/build),
    /// the glob pattern builder (add_pattern/build), or the literal hash builder.
    ///
    /// # Panics
    /// Panics if a pattern count, a section size, or a section offset does not
    /// fit in a `u32`.
    pub fn build(mut self) -> Result<Vec<u8>, FormatError> {
        // Data is already encoded - just extract from the builder
        let data_section = self.data_encoder.into_bytes();

        // Clear cache to free memory
        self.data_cache.clear();

        // Separate entries by type (using pre-encoded offsets)
        // Pre-allocate with capacity to avoid reallocation
        // (each vector is sized for the total entry count, not its own share)
        let entry_count = self.entries.len();
        let mut ip_entries = Vec::with_capacity(entry_count);
        let mut literal_entries = Vec::with_capacity(entry_count);
        let mut glob_entries = Vec::with_capacity(entry_count);

        for entry in &self.entries {
            match &entry.entry_type {
                EntryType::IpAddress { addr, prefix_len } => {
                    ip_entries.push((*addr, *prefix_len, entry.data_offset));
                }
                EntryType::Literal(pattern) => {
                    // Borrow the pattern text; `self.entries` outlives these vectors.
                    literal_entries.push((pattern.as_str(), entry.data_offset));
                }
                EntryType::Glob(pattern) => {
                    glob_entries.push((pattern.as_str(), entry.data_offset));
                }
            }
        }

        // Always build IP tree structure (even if empty) to maintain MMDB format
        // This ensures pattern-only databases still work with the Database API
        let (ip_tree_bytes, node_count, record_size, ip_version) = if ip_entries.is_empty() {
            // Empty IP tree - create minimal valid tree
            let record_size = RecordSize::Bits24;
            let tree_builder = IpTreeBuilder::new_v4(record_size);
            let (tree_bytes, node_cnt) = tree_builder.build()?;
            (tree_bytes, node_cnt, record_size, 4)
        } else {
            // Determine IP version needed: a single IPv6 entry makes the whole tree IPv6
            let needs_v6 = ip_entries.iter().any(|(addr, _, _)| addr.is_ipv6());

            // Choose record size based on expected tree size
            // For /32 IPs, worst case is ~ip_count nodes
            // 24-bit: max 16,777,216 nodes (16M IPs)
            // 28-bit: max 268,435,456 nodes (268M IPs)
            // 32-bit: max 4,294,967,296 nodes (4.2B IPs)
            // NOTE(review): the estimate is the entry count, not a node count;
            // confirm against IpTreeBuilder that the thresholds leave enough headroom.
            let estimated_nodes = ip_entries.len();
            let record_size = if estimated_nodes > 200_000_000 {
                // Over 200M IPs - use 32-bit for safety
                RecordSize::Bits32
            } else if estimated_nodes > 15_000_000 {
                // Over 15M IPs - use 28-bit
                RecordSize::Bits28
            } else {
                // Under 15M IPs - use 24-bit (most common)
                RecordSize::Bits24
            };

            // Sort IPs by prefix length (more specific first), then by address
            // This minimizes tree reorganization and backfill operations
            ip_entries.sort_unstable_by(|(addr1, prefix1, _), (addr2, prefix2, _)| {
                prefix2.cmp(prefix1).then_with(|| addr1.cmp(addr2))
            });

            let mut tree_builder = if needs_v6 {
                IpTreeBuilder::new_v6(record_size)
            } else {
                IpTreeBuilder::new_v4(record_size)
            };

            // Pre-allocate nodes (estimate: ~1.5x entries for typical CIDR distributions)
            tree_builder.reserve_nodes(estimated_nodes + estimated_nodes / 2);

            // Insert all IP entries using pre-encoded offsets
            for (addr, prefix_len, data_offset) in &ip_entries {
                tree_builder.insert(*addr, *prefix_len, *data_offset)?;
            }

            // Build the tree
            let (tree_bytes, node_cnt) = tree_builder.build()?;

            let ip_ver = if needs_v6 { 6 } else { 4 };
            (tree_bytes, node_cnt, record_size, ip_ver)
        };

        // Build glob pattern section if we have glob entries (NOT literals)
        let (has_globs, glob_section_bytes) = if glob_entries.is_empty() {
            (false, Vec::new())
        } else {
            let mut pattern_builder = ParaglobBuilder::new(self.match_mode);
            let mut pattern_data = Vec::with_capacity(glob_entries.len());

            for (pattern, data_offset) in &glob_entries {
                let pattern_id = pattern_builder.add_pattern(pattern)?;
                pattern_data.push((pattern_id, *data_offset));
            }

            let paraglob = pattern_builder.build()?;
            let paraglob_bytes = paraglob.buffer().to_vec();

            // Build complete pattern section: [total_size][paraglob_size][paraglob_data][mappings]
            let mut section = Vec::new();

            // Will fill in sizes at the end
            let size_placeholder = vec![0u8; 8]; // 2 u32s
            section.extend_from_slice(&size_placeholder);

            // Paraglob data
            section.extend_from_slice(&paraglob_bytes);

            // Mappings: pattern_count + data offsets
            // (offsets are written in insertion order; the pattern id itself is not written)
            let pattern_count =
                u32::try_from(pattern_data.len()).expect("Pattern count exceeds u32::MAX");
            section.extend_from_slice(&pattern_count.to_le_bytes());
            for (_pattern_id, data_offset) in pattern_data {
                section.extend_from_slice(&data_offset.to_le_bytes());
            }

            // Fill in sizes (little-endian; total_size includes the 8 header bytes)
            let total_size =
                u32::try_from(section.len()).expect("Glob section exceeds u32::MAX bytes");
            let paraglob_size =
                u32::try_from(paraglob_bytes.len()).expect("Paraglob data exceeds u32::MAX bytes");
            section[0..4].copy_from_slice(&total_size.to_le_bytes());
            section[4..8].copy_from_slice(&paraglob_size.to_le_bytes());

            (true, section)
        };

        // Build literal hash table section for literal_entries
        let (has_literals, literal_section_bytes) = if literal_entries.is_empty() {
            (false, Vec::new())
        } else {
            let mut literal_builder = LiteralHashBuilder::new(self.match_mode);
            let mut literal_pattern_data = Vec::with_capacity(literal_entries.len());

            // Literal pattern ids are simply the position in insertion order.
            for (next_pattern_id, (literal, data_offset)) in literal_entries.iter().enumerate() {
                let pid = u32::try_from(next_pattern_id).expect("Literal pattern ID exceeds u32");
                literal_builder.add_pattern(literal, pid);
                literal_pattern_data.push((pid, *data_offset));
            }

            let literal_bytes = literal_builder.build(&literal_pattern_data)?;
            (true, literal_bytes)
        };

        // Assemble final database - always use MMDB format
        let mut database = Vec::new();

        // IP tree (empty or populated)
        database.extend_from_slice(&ip_tree_bytes);
        database
            .extend_from_slice(b"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00"); // 16-byte separator

        // Data section
        database.extend_from_slice(&data_section);

        // Add padding to ensure paraglob section (if present) starts at 4-byte aligned offset
        // ParaglobHeader requires 4-byte alignment for zerocopy
        if has_globs {
            let current_offset = database.len() + 16; // +16 for "MMDB_PATTERN" separator
            let padding_needed = (4 - (current_offset % 4)) % 4;
            database.extend(std::iter::repeat_n(0u8, padding_needed));
        }

        // Add MMDB metadata section (always present)
        {
            // Build metadata map
            let mut metadata = HashMap::new();
            metadata.insert(
                "binary_format_major_version".to_string(),
                DataValue::Uint16(2),
            );
            metadata.insert(
                "binary_format_minor_version".to_string(),
                DataValue::Uint16(0),
            );
            // Seconds since the Unix epoch; 0 if the clock reports a time before the epoch.
            metadata.insert(
                "build_epoch".to_string(),
                DataValue::Uint64(
                    web_time::SystemTime::now()
                        .duration_since(web_time::UNIX_EPOCH)
                        .map(|d| d.as_secs())
                        .unwrap_or(0),
                ),
            );
            // Database type - use custom if provided, otherwise auto-generate
            // from which kinds of entries are present
            let db_type = self.database_type.clone().unwrap_or_else(|| {
                if has_globs || !literal_entries.is_empty() {
                    if ip_entries.is_empty() {
                        "Paraglob-Pattern".to_string()
                    } else {
                        "Paraglob-Combined-IP-Pattern".to_string()
                    }
                } else {
                    "Paraglob-IP".to_string()
                }
            });
            metadata.insert("database_type".to_string(), DataValue::String(db_type));

            // Description - use custom if provided, otherwise use default
            let description_map = if self.description.is_empty() {
                let mut desc = HashMap::new();
                desc.insert(
                    "en".to_string(),
                    DataValue::String(
                        "Paraglob unified database with IP and pattern matching".to_string(),
                    ),
                );
                desc
            } else {
                self.description
                    .iter()
                    .map(|(k, v)| (k.clone(), DataValue::String(v.clone())))
                    .collect()
            };
            metadata.insert("description".to_string(), DataValue::Map(description_map));
            // NOTE(review): "languages" is always ["en"], independent of the
            // language codes used in the custom descriptions.
            metadata.insert(
                "languages".to_string(),
                DataValue::Array(vec![DataValue::String("en".to_string())]),
            );
            // ip_version is always 4 or 6 here, so the conversion cannot fail.
            metadata.insert(
                "ip_version".to_string(),
                DataValue::Uint16(u16::try_from(ip_version).unwrap()),
            );
            metadata.insert("node_count".to_string(), DataValue::Uint32(node_count));
            metadata.insert(
                "record_size".to_string(),
                DataValue::Uint16(match record_size {
                    RecordSize::Bits24 => 24,
                    RecordSize::Bits28 => 28,
                    RecordSize::Bits32 => 32,
                }),
            );

            // Add entry counts for easy inspection
            // (counts larger than u32::MAX are saturated rather than rejected)
            metadata.insert(
                "ip_entry_count".to_string(),
                DataValue::Uint32(u32::try_from(ip_entries.len()).unwrap_or(u32::MAX)),
            );
            metadata.insert(
                "literal_entry_count".to_string(),
                DataValue::Uint32(u32::try_from(literal_entries.len()).unwrap_or(u32::MAX)),
            );
            metadata.insert(
                "glob_entry_count".to_string(),
                DataValue::Uint32(u32::try_from(glob_entries.len()).unwrap_or(u32::MAX)),
            );

            // Store match mode (0 = CaseSensitive, 1 = CaseInsensitive)
            let match_mode_value = match self.match_mode {
                MatchMode::CaseSensitive => 0u16,
                MatchMode::CaseInsensitive => 1u16,
            };
            metadata.insert(
                "match_mode".to_string(),
                DataValue::Uint16(match_mode_value),
            );

            if let Some(ref url) = self.update_url {
                metadata.insert("update_url".to_string(), DataValue::String(url.clone()));
            }

            // ALWAYS write section offset fields for fast loading (0 = not present)
            // This eliminates the need to scan the entire file for separators
            let tree_and_separator_size = ip_tree_bytes.len() + 16;
            let data_section_size = data_section.len();

            // Calculate padding before paraglob section for 4-byte alignment
            // (must match the padding actually appended to `database` above)
            let padding_before_paraglob = if has_globs {
                let current_offset = tree_and_separator_size + data_section_size + 16; // +16 for separator
                (4 - (current_offset % 4)) % 4
            } else {
                0
            };

            // Pattern section offset (after tree + separator + data section + padding)
            // 0 means no pattern section present
            let pattern_offset = if has_globs {
                tree_and_separator_size + data_section_size + padding_before_paraglob + 16
            // +16 for "MMDB_PATTERN" separator
            } else {
                0 // No pattern section
            };
            metadata.insert(
                "pattern_section_offset".to_string(),
                DataValue::Uint32(
                    u32::try_from(pattern_offset).expect("Pattern section offset exceeds u32::MAX"),
                ),
            );

            // Literal section offset (after pattern section if present)
            // 0 means no literal section present
            let literal_offset = if has_literals {
                if has_globs {
                    // tree + separator + data + padding + "MMDB_PATTERN" marker
                    // + glob section + "MMDB_LITERAL" marker
                    tree_and_separator_size
                        + data_section_size
                        + padding_before_paraglob
                        + 16
                        + glob_section_bytes.len()
                        + 16
                } else {
                    tree_and_separator_size + data_section_size + 16 // +16 for "MMDB_LITERAL" separator
                }
            } else {
                0 // No literal section
            };
            metadata.insert(
                "literal_section_offset".to_string(),
                DataValue::Uint32(
                    u32::try_from(literal_offset).expect("Literal section offset exceeds u32::MAX"),
                ),
            );

            // Encode metadata
            let mut meta_encoder = DataEncoder::new();
            let metadata_value = DataValue::Map(metadata);
            meta_encoder.encode(&metadata_value);
            let metadata_bytes = meta_encoder.into_bytes();

            // Save metadata for end of file (will be added after pattern section)
            // This ensures it's in the last 128KB for the metadata marker search

            // Add MMDB_PATTERN separator before globs (if any)
            if has_globs {
                database.extend_from_slice(b"MMDB_PATTERN\x00\x00\x00\x00");
                database.extend_from_slice(&glob_section_bytes);
            }

            // Add MMDB_LITERAL separator before literals (if any)
            if has_literals {
                database.extend_from_slice(b"MMDB_LITERAL\x00\x00\x00\x00");
                database.extend_from_slice(&literal_section_bytes);
            }

            // Add metadata at the END of the file so it's within the 128KB search window
            database.extend_from_slice(b"\xAB\xCD\xEFMaxMind.com");
            database.extend_from_slice(&metadata_bytes);
        }

        Ok(database)
    }
772
773    /// Get statistics about the builder
774    #[must_use]
775    pub fn stats(&self) -> BuilderStats {
776        let mut ip_count = 0;
777        let mut literal_count = 0;
778        let mut glob_count = 0;
779
780        for entry in &self.entries {
781            match &entry.entry_type {
782                EntryType::IpAddress { .. } => ip_count += 1,
783                EntryType::Literal(_) => literal_count += 1,
784                EntryType::Glob(_) => glob_count += 1,
785            }
786        }
787
788        BuilderStats {
789            total_entries: self.entries.len(),
790            ip_entries: ip_count,
791            literal_entries: literal_count,
792            glob_entries: glob_count,
793        }
794    }
795}
796
/// Builder statistics
///
/// A plain-data snapshot of how many entries of each kind a
/// `DatabaseBuilder` currently holds, as returned by `DatabaseBuilder::stats`.
/// Every entry is classified as exactly one of IP, literal, or glob, so a
/// snapshot produced by `stats` has `total_entries` equal to the sum of the
/// three per-kind counters.
///
/// Snapshots can be compared with `==` / `assert_eq!`, and
/// `BuilderStats::default()` yields the all-zero snapshot.
#[derive(Debug, Clone, PartialEq, Eq, Default)]
pub struct BuilderStats {
    /// Total number of entries added
    pub total_entries: usize,
    /// Number of IP address/CIDR entries
    pub ip_entries: usize,
    /// Number of literal string entries (exact match)
    pub literal_entries: usize,
    /// Number of glob pattern entries (wildcard match)
    pub glob_entries: usize,
}
809
#[cfg(test)]
mod tests {
    use super::*;

    /// Classify `input`, panicking at the calling test's location if
    /// detection returns an error.
    #[track_caller]
    fn detect(input: &str) -> EntryType {
        DatabaseBuilder::detect_entry_type(input).unwrap()
    }

    // ---------- Auto-detection ----------

    #[test]
    fn test_detect_ip_address() {
        // A bare IPv4 address is a host entry with a full-length prefix.
        if let EntryType::IpAddress { addr, prefix_len } = detect("8.8.8.8") {
            assert_eq!(addr.to_string(), "8.8.8.8");
            assert_eq!(prefix_len, 32);
        } else {
            panic!("Expected IP address");
        }
    }

    #[test]
    fn test_detect_cidr() {
        // CIDR notation keeps the network address and the explicit prefix.
        if let EntryType::IpAddress { addr, prefix_len } = detect("192.168.0.0/16") {
            assert_eq!(addr.to_string(), "192.168.0.0");
            assert_eq!(prefix_len, 16);
        } else {
            panic!("Expected CIDR");
        }
    }

    #[test]
    fn test_detect_ipv6() {
        // A bare IPv6 address gets a 128-bit prefix.
        if let EntryType::IpAddress { addr, prefix_len } = detect("2001:4860:4860::8888") {
            assert!(addr.is_ipv6());
            assert_eq!(prefix_len, 128);
        } else {
            panic!("Expected IPv6");
        }
    }

    #[test]
    fn test_detect_pattern_wildcard() {
        if let EntryType::Glob(pattern) = detect("*.evil.com") {
            assert_eq!(pattern, "*.evil.com");
        } else {
            panic!("Expected glob pattern");
        }
    }

    #[test]
    fn test_detect_pattern_literal() {
        if let EntryType::Literal(text) = detect("evil.com") {
            assert_eq!(text, "evil.com");
        } else {
            panic!("Expected literal pattern");
        }
    }

    // ---------- Prefix convention ----------

    #[test]
    fn test_literal_prefix_forces_literal() {
        // The input contains glob characters, yet the `literal:` prefix must
        // win over auto-detection.
        match detect("literal:*.not-a-glob.com") {
            EntryType::Literal(text) => assert_eq!(text, "*.not-a-glob.com"),
            other => panic!("Expected literal, got: {other:?}"),
        }
    }

    #[test]
    fn test_literal_prefix_strips_correctly() {
        if let EntryType::Literal(text) = detect("literal:evil.example.com") {
            assert_eq!(text, "evil.example.com");
            assert!(!text.starts_with("literal:"));
        } else {
            panic!("Expected literal");
        }
    }

    #[test]
    fn test_glob_prefix_forces_glob() {
        // No wildcards here, so auto-detection would say literal; the `glob:`
        // prefix must override that.
        match detect("glob:no-wildcards.com") {
            EntryType::Glob(pattern) => assert_eq!(pattern, "no-wildcards.com"),
            other => panic!("Expected glob, got: {other:?}"),
        }
    }

    #[test]
    fn test_glob_prefix_with_wildcards() {
        if let EntryType::Glob(pattern) = detect("glob:*.evil.com") {
            assert_eq!(pattern, "*.evil.com");
            assert!(!pattern.starts_with("glob:"));
        } else {
            panic!("Expected glob");
        }
    }

    #[test]
    fn test_glob_prefix_invalid_pattern() {
        // An explicit `glob:` entry with malformed glob syntax is rejected.
        let outcome = DatabaseBuilder::detect_entry_type("glob:[unclosed");
        assert!(outcome.is_err());

        let message = outcome.unwrap_err().to_string();
        assert!(message.contains("Invalid glob pattern syntax"));
    }

    #[test]
    fn test_ip_prefix_forces_ip() {
        if let EntryType::IpAddress { addr, prefix_len } = detect("ip:8.8.8.8") {
            assert_eq!(addr.to_string(), "8.8.8.8");
            assert_eq!(prefix_len, 32);
        } else {
            panic!("Expected IP address");
        }
    }

    #[test]
    fn test_ip_prefix_with_cidr() {
        if let EntryType::IpAddress { addr, prefix_len } = detect("ip:10.0.0.0/8") {
            assert_eq!(addr.to_string(), "10.0.0.0");
            assert_eq!(prefix_len, 8);
        } else {
            panic!("Expected CIDR");
        }
    }

    #[test]
    fn test_ip_prefix_invalid_ip() {
        let outcome = DatabaseBuilder::detect_entry_type("ip:not-an-ip");
        assert!(outcome.is_err());
    }

    #[test]
    fn test_auto_detection_still_works() {
        // Unprefixed inputs must still be classified by auto-detection.
        let ip = DatabaseBuilder::detect_entry_type("1.2.3.4");
        assert!(matches!(ip, Ok(EntryType::IpAddress { .. })));

        let glob = DatabaseBuilder::detect_entry_type("*.example.com");
        assert!(matches!(glob, Ok(EntryType::Glob(_))));

        let literal = DatabaseBuilder::detect_entry_type("example.com");
        assert!(matches!(literal, Ok(EntryType::Literal(_))));
    }

    #[test]
    fn test_prefix_case_sensitive() {
        // Prefixes are case-sensitive: an upper-case `LITERAL:` is not a
        // recognized prefix, so the whole string goes through auto-detection
        // and comes back as a literal with the unrecognized prefix intact.
        if let EntryType::Literal(text) = detect("LITERAL:test.com") {
            assert_eq!(text, "LITERAL:test.com");
        } else {
            panic!("Expected literal");
        }
    }

    #[test]
    fn test_literal_prefix_with_question_mark() {
        if let EntryType::Literal(text) = detect("literal:file?.txt") {
            assert_eq!(text, "file?.txt");
        } else {
            panic!("Expected literal");
        }
    }

    #[test]
    fn test_literal_prefix_with_brackets() {
        if let EntryType::Literal(text) = detect("literal:file[1].txt") {
            assert_eq!(text, "file[1].txt");
        } else {
            panic!("Expected literal");
        }
    }

    // ---------- Builder integration ----------

    #[test]
    fn test_builder_add_entry_with_prefix() {
        // `add_entry` must honour the prefix: a glob-looking string forced to
        // literal is counted as a literal entry.
        let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);
        builder
            .add_entry("literal:*.test.com", HashMap::new())
            .unwrap();

        let counts = builder.stats();
        assert_eq!(counts.literal_entries, 1);
        assert_eq!(counts.glob_entries, 0);
    }

    #[test]
    fn test_builder_add_entry_glob_prefix() {
        // A wildcard-free string forced to glob is counted as a glob entry.
        let mut builder = DatabaseBuilder::new(MatchMode::CaseSensitive);
        builder.add_entry("glob:test.com", HashMap::new()).unwrap();

        let counts = builder.stats();
        assert_eq!(counts.glob_entries, 1);
        assert_eq!(counts.literal_entries, 0);
    }

    #[test]
    fn test_empty_prefix_value() {
        // Edge case: `literal:` with nothing after it yields an empty literal.
        if let EntryType::Literal(text) = detect("literal:") {
            assert_eq!(text, "");
        } else {
            panic!("Expected literal");
        }
    }
}