ngdp_client/
pattern_extraction.rs

1//! Pattern-based file extraction with glob, regex, and key matching support
2
3use regex::Regex;
4use std::collections::HashSet;
5use std::path::Path;
6use thiserror::Error;
7use tracing::{debug, info, warn};
8
9#[derive(Error, Debug)]
10pub enum PatternError {
11    #[error("Invalid regex pattern: {0}")]
12    InvalidRegex(#[from] regex::Error),
13    #[error("Invalid content key format: {0}")]
14    InvalidContentKey(String),
15    #[error("Invalid encoding key format: {0}")]
16    InvalidEncodingKey(String),
17    #[error("Pattern type could not be determined: {0}")]
18    UnknownPattern(String),
19}
20
21/// Types of patterns we can match against
22#[derive(Debug, Clone)]
23pub enum PatternType {
24    /// Glob pattern (e.g., "*.dbc", "interface/**/*.lua")
25    Glob(String),
26    /// Regular expression (e.g., r"/sound/.*\.ogg$/")
27    Regex(Regex),
28    /// 32-character hex content key
29    ContentKey(String),
30    /// 18-character hex encoding key
31    EncodingKey(String),
32    /// Exact file path match
33    FilePath(String),
34}
35
36impl PartialEq for PatternType {
37    fn eq(&self, other: &Self) -> bool {
38        match (self, other) {
39            (PatternType::Glob(a), PatternType::Glob(b)) => a == b,
40            (PatternType::Regex(a), PatternType::Regex(b)) => a.as_str() == b.as_str(),
41            (PatternType::ContentKey(a), PatternType::ContentKey(b)) => a == b,
42            (PatternType::EncodingKey(a), PatternType::EncodingKey(b)) => a == b,
43            (PatternType::FilePath(a), PatternType::FilePath(b)) => a == b,
44            _ => false,
45        }
46    }
47}
48
/// Options that control how patterns are matched.
#[derive(Clone, Debug)]
pub struct PatternConfig {
    /// Match case-sensitively when `true` (default: `false`).
    pub case_sensitive: bool,
    /// Upper bound on accepted matches per pattern; `None` means no limit
    /// (the default).
    pub max_matches_per_pattern: Option<usize>,
    /// Whether directories should be included in matches (default: `false`).
    pub include_directories: bool,
    /// File extensions that should be prioritized when several files match.
    pub priority_extensions: Vec<String>,
}

impl Default for PatternConfig {
    /// Case-insensitive, unlimited matches, no directories, and `dbc`, `db2`
    /// and `lua` as the prioritized extensions.
    fn default() -> Self {
        let priority_extensions = ["dbc", "db2", "lua"]
            .iter()
            .map(|ext| ext.to_string())
            .collect();

        Self {
            case_sensitive: false,
            max_matches_per_pattern: None,
            include_directories: false,
            priority_extensions,
        }
    }
}
72
73/// A compiled pattern ready for matching
74#[derive(Debug)]
75pub struct CompiledPattern {
76    pub pattern_type: PatternType,
77    pub original: String,
78    pub config: PatternConfig,
79}
80
81/// Results from pattern matching
82#[derive(Debug, Clone)]
83pub struct PatternMatch {
84    /// The file path that matched
85    pub file_path: String,
86    /// The pattern that caused the match
87    pub pattern: String,
88    /// Additional metadata about the match
89    pub metadata: MatchMetadata,
90}
91
/// Extra details attached to a [`PatternMatch`].
///
/// The derived `Default` leaves every optional field as `None` and the
/// priority score at `0`.
#[derive(Clone, Debug, Default)]
pub struct MatchMetadata {
    /// Size of the file, when known.
    pub file_size: Option<u64>,
    /// Content key of the file, when known.
    pub content_key: Option<String>,
    /// Encoding key of the file, when known.
    pub encoding_key: Option<String>,
    /// File type derived from the path's extension.
    pub file_type: Option<String>,
    /// Ranking score; larger values are more important.
    pub priority_score: u32,
}
106
107/// Pattern extraction engine
108pub struct PatternExtractor {
109    config: PatternConfig,
110    compiled_patterns: Vec<CompiledPattern>,
111}
112
113impl PatternExtractor {
114    /// Create a new pattern extractor with default configuration
115    pub fn new() -> Self {
116        Self {
117            config: PatternConfig::default(),
118            compiled_patterns: Vec::new(),
119        }
120    }
121
122    /// Create a pattern extractor with custom configuration
123    pub fn with_config(config: PatternConfig) -> Self {
124        Self {
125            config,
126            compiled_patterns: Vec::new(),
127        }
128    }
129
130    /// Add a pattern to the extractor
131    pub fn add_pattern(&mut self, pattern: &str) -> Result<(), PatternError> {
132        let pattern_type = self.detect_pattern_type(pattern)?;
133        let compiled = CompiledPattern {
134            pattern_type,
135            original: pattern.to_string(),
136            config: self.config.clone(),
137        };
138
139        info!("Added pattern: {} -> {:?}", pattern, compiled.pattern_type);
140        self.compiled_patterns.push(compiled);
141        Ok(())
142    }
143
144    /// Add multiple patterns at once
145    pub fn add_patterns(&mut self, patterns: &[String]) -> Result<(), PatternError> {
146        for pattern in patterns {
147            self.add_pattern(pattern)?;
148        }
149        Ok(())
150    }
151
152    /// Detect what type of pattern this is
153    fn detect_pattern_type(&self, pattern: &str) -> Result<PatternType, PatternError> {
154        // Check if it's a regex pattern (starts with / and ends with /)
155        if pattern.starts_with('/') && pattern.ends_with('/') && pattern.len() > 2 {
156            let regex_str = &pattern[1..pattern.len() - 1];
157            let regex = if self.config.case_sensitive {
158                Regex::new(regex_str)?
159            } else {
160                Regex::new(&format!("(?i){regex_str}"))?
161            };
162            return Ok(PatternType::Regex(regex));
163        }
164
165        // Check if it's a content key (32 hex characters)
166        if pattern.len() == 32 && pattern.chars().all(|c| c.is_ascii_hexdigit()) {
167            return Ok(PatternType::ContentKey(pattern.to_lowercase()));
168        }
169
170        // Check if it's an encoding key (18 hex characters)
171        if pattern.len() == 18 && pattern.chars().all(|c| c.is_ascii_hexdigit()) {
172            return Ok(PatternType::EncodingKey(pattern.to_lowercase()));
173        }
174
175        // Check if it contains glob characters
176        if pattern.contains('*')
177            || pattern.contains('?')
178            || pattern.contains('[')
179            || pattern.contains('{')
180        {
181            return Ok(PatternType::Glob(pattern.to_string()));
182        }
183
184        // Default to file path
185        Ok(PatternType::FilePath(pattern.to_string()))
186    }
187
188    /// Match patterns against a list of file paths
189    pub fn match_files(&self, file_paths: &[String]) -> Vec<PatternMatch> {
190        let mut matches = Vec::new();
191        let mut seen_files = HashSet::new();
192
193        info!(
194            "Matching {} patterns against {} files",
195            self.compiled_patterns.len(),
196            file_paths.len()
197        );
198
199        for compiled_pattern in &self.compiled_patterns {
200            let pattern_matches = self.match_pattern(compiled_pattern, file_paths);
201
202            debug!(
203                "Pattern '{}' matched {} files",
204                compiled_pattern.original,
205                pattern_matches.len()
206            );
207
208            // Apply limits and deduplication
209            let mut added_for_pattern = 0;
210            for mut pattern_match in pattern_matches {
211                if seen_files.contains(&pattern_match.file_path) {
212                    continue;
213                }
214
215                // Apply per-pattern limit
216                if let Some(limit) = compiled_pattern.config.max_matches_per_pattern {
217                    if added_for_pattern >= limit {
218                        debug!(
219                            "Reached limit of {} matches for pattern '{}'",
220                            limit, compiled_pattern.original
221                        );
222                        break;
223                    }
224                }
225
226                // Calculate priority score
227                pattern_match.metadata.priority_score = self.calculate_priority(&pattern_match);
228
229                seen_files.insert(pattern_match.file_path.clone());
230                matches.push(pattern_match);
231                added_for_pattern += 1;
232            }
233        }
234
235        // Sort by priority score (descending)
236        matches.sort_by(|a, b| b.metadata.priority_score.cmp(&a.metadata.priority_score));
237
238        info!("Total matches found: {}", matches.len());
239        matches
240    }
241
242    /// Match a single compiled pattern against file paths
243    fn match_pattern(
244        &self,
245        compiled_pattern: &CompiledPattern,
246        file_paths: &[String],
247    ) -> Vec<PatternMatch> {
248        match &compiled_pattern.pattern_type {
249            PatternType::Glob(glob_pattern) => {
250                self.match_glob_pattern(glob_pattern, file_paths, &compiled_pattern.original)
251            }
252            PatternType::Regex(regex) => {
253                self.match_regex_pattern(regex, file_paths, &compiled_pattern.original)
254            }
255            PatternType::ContentKey(ckey) => {
256                self.match_content_key(ckey, &compiled_pattern.original)
257            }
258            PatternType::EncodingKey(ekey) => {
259                self.match_encoding_key(ekey, &compiled_pattern.original)
260            }
261            PatternType::FilePath(path) => {
262                self.match_file_path(path, file_paths, &compiled_pattern.original)
263            }
264        }
265    }
266
267    /// Match glob patterns like "*.dbc" or "interface/**/*.lua"
268    fn match_glob_pattern(
269        &self,
270        glob_pattern: &str,
271        file_paths: &[String],
272        original: &str,
273    ) -> Vec<PatternMatch> {
274        let mut matches = Vec::new();
275
276        // Convert glob to regex
277        let regex_pattern = self.glob_to_regex(glob_pattern);
278        let regex = match Regex::new(&regex_pattern) {
279            Ok(r) => r,
280            Err(e) => {
281                warn!(
282                    "Failed to compile glob pattern '{}' to regex: {}",
283                    glob_pattern, e
284                );
285                return matches;
286            }
287        };
288
289        for file_path in file_paths {
290            let test_path = if self.config.case_sensitive {
291                file_path.clone()
292            } else {
293                file_path.to_lowercase()
294            };
295
296            if regex.is_match(&test_path) {
297                matches.push(PatternMatch {
298                    file_path: file_path.clone(),
299                    pattern: original.to_string(),
300                    metadata: self.create_metadata_for_file(file_path),
301                });
302            }
303        }
304
305        matches
306    }
307
308    /// Match regex patterns
309    fn match_regex_pattern(
310        &self,
311        regex: &Regex,
312        file_paths: &[String],
313        original: &str,
314    ) -> Vec<PatternMatch> {
315        let mut matches = Vec::new();
316
317        for file_path in file_paths {
318            if regex.is_match(file_path) {
319                matches.push(PatternMatch {
320                    file_path: file_path.clone(),
321                    pattern: original.to_string(),
322                    metadata: self.create_metadata_for_file(file_path),
323                });
324            }
325        }
326
327        matches
328    }
329
330    /// Match content keys (would need manifest integration)
331    fn match_content_key(&self, _ckey: &str, original: &str) -> Vec<PatternMatch> {
332        // For now, create a placeholder match
333        // In full implementation, would resolve via encoding/root files
334        vec![PatternMatch {
335            file_path: format!("content_key_{_ckey}.data"),
336            pattern: original.to_string(),
337            metadata: MatchMetadata {
338                content_key: Some(_ckey.to_string()),
339                priority_score: 100, // High priority for direct keys
340                ..Default::default()
341            },
342        }]
343    }
344
345    /// Match encoding keys (would need manifest integration)
346    fn match_encoding_key(&self, _ekey: &str, original: &str) -> Vec<PatternMatch> {
347        // For now, create a placeholder match
348        // In full implementation, would resolve via encoding file
349        vec![PatternMatch {
350            file_path: format!("encoding_key_{_ekey}.data"),
351            pattern: original.to_string(),
352            metadata: MatchMetadata {
353                encoding_key: Some(_ekey.to_string()),
354                priority_score: 90, // High priority for direct keys
355                ..Default::default()
356            },
357        }]
358    }
359
360    /// Match exact file paths
361    fn match_file_path(
362        &self,
363        target_path: &str,
364        file_paths: &[String],
365        original: &str,
366    ) -> Vec<PatternMatch> {
367        let mut matches = Vec::new();
368
369        let normalized_target = self.normalize_path(target_path);
370
371        for file_path in file_paths {
372            let normalized_file = self.normalize_path(file_path);
373
374            if normalized_target == normalized_file {
375                matches.push(PatternMatch {
376                    file_path: file_path.clone(),
377                    pattern: original.to_string(),
378                    metadata: self.create_metadata_for_file(file_path),
379                });
380            }
381        }
382
383        matches
384    }
385
386    /// Convert glob pattern to regex
387    fn glob_to_regex(&self, glob: &str) -> String {
388        let mut regex = String::new();
389        let mut chars = glob.chars().peekable();
390
391        regex.push('^');
392
393        while let Some(ch) = chars.next() {
394            match ch {
395                '*' => {
396                    if chars.peek() == Some(&'*') {
397                        chars.next(); // consume second *
398                        if chars.peek() == Some(&'/') {
399                            chars.next(); // consume /
400                            regex.push_str("(?:[^/]+/)*"); // match any number of path segments
401                        } else {
402                            regex.push_str(".*"); // match everything
403                        }
404                    } else {
405                        regex.push_str("[^/]*"); // match everything except path separator
406                    }
407                }
408                '?' => regex.push_str("[^/]"),
409                '[' => {
410                    regex.push('[');
411                    // Copy character class
412                    for ch in chars.by_ref() {
413                        regex.push(ch);
414                        if ch == ']' {
415                            break;
416                        }
417                    }
418                }
419                '{' => {
420                    // Convert {a,b,c} to (a|b|c)
421                    regex.push('(');
422                    for ch in chars.by_ref() {
423                        if ch == '}' {
424                            break;
425                        } else if ch == ',' {
426                            regex.push('|');
427                        } else {
428                            if "^$()[]{}|+.\\".contains(ch) {
429                                regex.push('\\');
430                            }
431                            regex.push(ch);
432                        }
433                    }
434                    regex.push(')');
435                }
436                // Escape regex special characters
437                ch if "^$()[]{}|+.\\".contains(ch) => {
438                    regex.push('\\');
439                    regex.push(ch);
440                }
441                ch => regex.push(ch),
442            }
443        }
444
445        regex.push('$');
446
447        if !self.config.case_sensitive {
448            format!("(?i){regex}")
449        } else {
450            regex
451        }
452    }
453
454    /// Normalize path for comparison
455    fn normalize_path(&self, path: &str) -> String {
456        let mut normalized = path.replace('\\', "/");
457        if !self.config.case_sensitive {
458            normalized = normalized.to_lowercase();
459        }
460        normalized
461    }
462
463    /// Create metadata for a file path
464    fn create_metadata_for_file(&self, file_path: &str) -> MatchMetadata {
465        let file_type = Path::new(file_path)
466            .extension()
467            .and_then(|ext| ext.to_str())
468            .map(|ext| ext.to_lowercase());
469
470        MatchMetadata {
471            file_type,
472            ..Default::default()
473        }
474    }
475
476    /// Calculate priority score for a match
477    fn calculate_priority(&self, pattern_match: &PatternMatch) -> u32 {
478        let mut score = 10; // Base score
479
480        // Boost priority for certain file extensions
481        if let Some(file_type) = &pattern_match.metadata.file_type {
482            if self.config.priority_extensions.contains(file_type) {
483                score += 50;
484            }
485
486            // Additional boosts for specific file types
487            score += match file_type.as_str() {
488                "dbc" | "db2" => 40, // Database files
489                "lua" | "xml" => 30, // Interface files
490                "ogg" | "mp3" => 20, // Audio files
491                "blp" | "tga" => 20, // Image files
492                "m2" | "wmo" => 25,  // 3D models
493                _ => 0,
494            };
495        }
496
497        // Boost for direct key matches
498        if pattern_match.metadata.content_key.is_some() {
499            score += 100;
500        }
501        if pattern_match.metadata.encoding_key.is_some() {
502            score += 90;
503        }
504
505        score
506    }
507
508    /// Get statistics about the compiled patterns
509    pub fn get_stats(&self) -> PatternStats {
510        let mut stats = PatternStats::default();
511
512        for pattern in &self.compiled_patterns {
513            match &pattern.pattern_type {
514                PatternType::Glob(_) => stats.glob_patterns += 1,
515                PatternType::Regex(_) => stats.regex_patterns += 1,
516                PatternType::ContentKey(_) => stats.content_keys += 1,
517                PatternType::EncodingKey(_) => stats.encoding_keys += 1,
518                PatternType::FilePath(_) => stats.file_paths += 1,
519            }
520        }
521
522        stats.total_patterns = self.compiled_patterns.len();
523        stats
524    }
525}
526
527impl Default for PatternExtractor {
528    fn default() -> Self {
529        Self::new()
530    }
531}
532
/// Counts of the patterns held by an extractor, in total and per kind.
#[derive(Default, Debug)]
pub struct PatternStats {
    /// Number of patterns of any kind.
    pub total_patterns: usize,
    /// Number of glob patterns.
    pub glob_patterns: usize,
    /// Number of regex patterns.
    pub regex_patterns: usize,
    /// Number of content-key patterns.
    pub content_keys: usize,
    /// Number of encoding-key patterns.
    pub encoding_keys: usize,
    /// Number of literal file-path patterns.
    pub file_paths: usize,
}
543
#[cfg(test)]
mod tests {
    use super::*;

    /// Turns string literals into the owned path list `match_files` expects.
    fn paths(raw: &[&str]) -> Vec<String> {
        raw.iter().map(|p| p.to_string()).collect()
    }

    /// Each pattern syntax is classified as the expected variant.
    #[test]
    fn test_pattern_detection() {
        let extractor = PatternExtractor::new();
        let detect = |pattern: &str| extractor.detect_pattern_type(pattern).unwrap();

        // Glob patterns
        assert!(matches!(detect("*.dbc"), PatternType::Glob(_)));
        assert!(matches!(
            detect("interface/**/*.lua"),
            PatternType::Glob(_)
        ));

        // Regex patterns
        assert!(matches!(
            detect("/sound/.*\\.ogg$/"),
            PatternType::Regex(_)
        ));

        // Content key (32 hex characters)
        assert!(matches!(
            detect("0123456789abcdef0123456789abcdef"),
            PatternType::ContentKey(_)
        ));

        // Encoding key (18 hex characters)
        assert!(matches!(
            detect("0123456789abcdef01"),
            PatternType::EncodingKey(_)
        ));

        // Plain file path
        assert!(matches!(
            detect("world/maps/azeroth/azeroth.wdt"),
            PatternType::FilePath(_)
        ));
    }

    /// A `*.dbc` glob selects only the `.dbc` files.
    #[test]
    fn test_glob_matching() {
        let mut extractor = PatternExtractor::new();
        extractor.add_pattern("*.dbc").unwrap();

        let files = paths(&[
            "achievement.dbc",
            "spell.dbc",
            "item.db2",
            "interface/framexml/uiparent.lua",
        ]);

        let found = extractor.match_files(&files);
        assert_eq!(found.len(), 2); // Only .dbc files should match

        for expected in ["achievement.dbc", "spell.dbc"] {
            assert!(found.iter().any(|m| m.file_path == expected));
        }
    }

    /// A `/.../` regex selects only the `.lua` files.
    #[test]
    fn test_regex_matching() {
        let mut extractor = PatternExtractor::new();
        extractor.add_pattern("/.*\\.lua$/").unwrap();

        let files = paths(&[
            "interface/framexml/uiparent.lua",
            "scripts/addon.lua",
            "spell.dbc",
        ]);

        let found = extractor.match_files(&files);
        assert_eq!(found.len(), 2); // Only .lua files should match
    }

    /// Glob translation produces the exact expected regex source.
    #[test]
    fn test_glob_to_regex_conversion() {
        let extractor = PatternExtractor::new();

        let cases = [
            ("*.dbc", "(?i)^[^/]*\\.dbc$"),
            ("test?.txt", "(?i)^test[^/]\\.txt$"),
            ("**/*.lua", "(?i)^(?:[^/]+/)*[^/]*\\.lua$"),
        ];

        for (glob, expected) in cases {
            assert_eq!(extractor.glob_to_regex(glob), expected);
        }
    }

    /// `.dbc` files score well above the base priority.
    #[test]
    fn test_priority_calculation() {
        let extractor = PatternExtractor::new();

        let metadata = MatchMetadata {
            file_type: Some(String::from("dbc")),
            ..Default::default()
        };
        let dbc_match = PatternMatch {
            file_path: String::from("spell.dbc"),
            pattern: String::from("*.dbc"),
            metadata,
        };

        let score = extractor.calculate_priority(&dbc_match);
        assert!(score > 50); // Should have high priority for .dbc files
    }
}
651        assert!(score > 50); // Should have high priority for .dbc files
652    }
653}