Skip to main content

sbom_tools/matching/
rules.rs

//! Ecosystem-specific matching rules.
//!
//! Provides configurable rules for normalizing and matching package names
//! according to each ecosystem's conventions.
5
6use crate::model::Ecosystem;
7use regex::Regex;
8use std::collections::HashMap;
9
10use super::ecosystem_config::{
11    ConfigError, EcosystemConfig, EcosystemRulesConfig, NormalizationConfig, ScopeHandling,
12    TyposquatEntry,
13};
14
/// Ecosystem-specific normalization and matching rules.
///
/// Bundles the rule configuration with the regexes compiled from it. The
/// pattern maps are keyed by the same ecosystem names that are used as keys
/// in `config.ecosystems`.
pub struct EcosystemRules {
    /// Rule configuration the compiled patterns below were derived from
    config: EcosystemRulesConfig,
    /// Compiled regex patterns for suspicious package detection, per ecosystem
    suspicious_patterns: HashMap<String, Vec<Regex>>,
    /// Compiled regex patterns for group migrations, per ecosystem; each
    /// pattern is paired with the replacement string of its migration
    migration_patterns: HashMap<String, Vec<(Regex, String)>>,
    /// Compiled regex patterns for package group glob members
    /// Key: ecosystem -> group_name -> Vec<Regex>
    package_group_patterns: HashMap<String, HashMap<String, Vec<Regex>>>,
}
27
28impl EcosystemRules {
29    /// Create a new ecosystem rules instance with built-in defaults
30    pub fn new() -> Self {
31        Self::with_config(EcosystemRulesConfig::builtin())
32    }
33
34    /// Create ecosystem rules with custom configuration
35    pub fn with_config(config: EcosystemRulesConfig) -> Self {
36        let suspicious_patterns = Self::compile_suspicious_patterns(&config);
37        let migration_patterns = Self::compile_migration_patterns(&config);
38        let package_group_patterns = Self::compile_package_group_patterns(&config);
39
40        Self {
41            config,
42            suspicious_patterns,
43            migration_patterns,
44            package_group_patterns,
45        }
46    }
47
48    /// Load configuration from a file
49    pub fn from_file(path: &std::path::Path) -> Result<Self, ConfigError> {
50        let config = EcosystemRulesConfig::from_file(path)?;
51        Ok(Self::with_config(config))
52    }
53
54    /// Load configuration from default locations with precedence
55    pub fn from_default_locations() -> Self {
56        let config = EcosystemRulesConfig::load_with_precedence(&[
57            ".sbom-tools/ecosystem-rules.yaml",
58            ".sbom-tools/ecosystem-rules.json",
59            "~/.config/sbom-tools/ecosystem-rules.yaml",
60            "~/.config/sbom-tools/ecosystem-rules.json",
61        ])
62        .unwrap_or_else(|_| EcosystemRulesConfig::builtin());
63
64        Self::with_config(config)
65    }
66
67    /// Compile suspicious package name patterns
68    fn compile_suspicious_patterns(config: &EcosystemRulesConfig) -> HashMap<String, Vec<Regex>> {
69        let mut patterns = HashMap::with_capacity(config.ecosystems.len());
70
71        for (ecosystem, eco_config) in &config.ecosystems {
72            let mut compiled = Vec::with_capacity(eco_config.security.suspicious_patterns.len());
73            for pattern in &eco_config.security.suspicious_patterns {
74                if let Ok(re) = Regex::new(pattern) {
75                    compiled.push(re);
76                }
77            }
78            if !compiled.is_empty() {
79                patterns.insert(ecosystem.clone(), compiled);
80            }
81        }
82
83        patterns
84    }
85
86    /// Compile group migration patterns
87    fn compile_migration_patterns(
88        config: &EcosystemRulesConfig,
89    ) -> HashMap<String, Vec<(Regex, String)>> {
90        let mut patterns = HashMap::with_capacity(config.ecosystems.len());
91
92        for (ecosystem, eco_config) in &config.ecosystems {
93            let mut compiled = Vec::with_capacity(eco_config.group_migrations.len());
94            for migration in &eco_config.group_migrations {
95                // Convert glob pattern to regex
96                let regex_pattern = migration.from.replace('.', r"\.").replace('*', ".*");
97                if let Ok(re) = Regex::new(&format!("^{}$", regex_pattern)) {
98                    compiled.push((re, migration.to.clone()));
99                }
100            }
101            if !compiled.is_empty() {
102                patterns.insert(ecosystem.clone(), compiled);
103            }
104        }
105
106        patterns
107    }
108
109    /// Compile package group glob patterns for efficient matching
110    fn compile_package_group_patterns(
111        config: &EcosystemRulesConfig,
112    ) -> HashMap<String, HashMap<String, Vec<Regex>>> {
113        let mut eco_patterns = HashMap::with_capacity(config.ecosystems.len());
114
115        for (ecosystem, eco_config) in &config.ecosystems {
116            let mut group_patterns = HashMap::with_capacity(eco_config.package_groups.len());
117
118            for (group_name, group) in &eco_config.package_groups {
119                // Count glob patterns to pre-allocate
120                let glob_count = group.members.iter().filter(|m| m.contains('*')).count();
121                let mut compiled = Vec::with_capacity(glob_count);
122                for member in &group.members {
123                    if member.contains('*') {
124                        // Convert glob pattern to regex
125                        let regex_pattern = member.replace('.', r"\.").replace('*', ".*");
126                        if let Ok(re) = Regex::new(&format!("^{}$", regex_pattern)) {
127                            compiled.push(re);
128                        }
129                    }
130                }
131                if !compiled.is_empty() {
132                    group_patterns.insert(group_name.clone(), compiled);
133                }
134            }
135
136            if !group_patterns.is_empty() {
137                eco_patterns.insert(ecosystem.clone(), group_patterns);
138            }
139        }
140
141        eco_patterns
142    }
143
144    /// Get the underlying configuration
145    pub fn config(&self) -> &EcosystemRulesConfig {
146        &self.config
147    }
148
149    /// Normalize a package name according to ecosystem rules
150    pub fn normalize_name(&self, name: &str, ecosystem: &Ecosystem) -> String {
151        let eco_key = Self::ecosystem_key(ecosystem);
152
153        if let Some(eco_config) = self.config.ecosystems.get(&eco_key) {
154            self.apply_normalization(name, eco_config)
155        } else {
156            // Fallback to basic normalization
157            name.to_lowercase()
158        }
159    }
160
161    /// Apply normalization rules from config
162    fn apply_normalization(&self, name: &str, config: &EcosystemConfig) -> String {
163        let norm = &config.normalization;
164        let mut result = name.to_string();
165
166        // Handle scoped packages (npm @scope/name)
167        if result.starts_with('@') {
168            result = self.normalize_scoped_name(&result, norm);
169        } else {
170            // Apply case sensitivity
171            if !norm.case_sensitive {
172                result = result.to_lowercase();
173            }
174        }
175
176        // Apply character equivalence
177        for char_group in &norm.equivalent_chars {
178            if char_group.len() >= 2 {
179                let target = &char_group[0];
180                for source in &char_group[1..] {
181                    result = result.replace(source.as_str(), target);
182                }
183            }
184        }
185
186        // Collapse separators if enabled
187        if norm.collapse_separators {
188            result = self.collapse_separators(&result);
189        }
190
191        // Strip version suffix for Go modules
192        if norm.strip_version_suffix {
193            result = self.strip_go_version_suffix(&result);
194        }
195
196        result
197    }
198
199    /// Normalize scoped package name (npm @scope/name)
200    fn normalize_scoped_name(&self, name: &str, norm: &NormalizationConfig) -> String {
201        match norm.scope_handling {
202            ScopeHandling::Lowercase => name.to_lowercase(),
203            ScopeHandling::PreserveScopeCase => {
204                if let Some(slash_pos) = name.find('/') {
205                    let scope = &name[..slash_pos];
206                    let pkg_name = &name[slash_pos + 1..];
207                    format!("{}/{}", scope.to_lowercase(), pkg_name.to_lowercase())
208                } else {
209                    name.to_lowercase()
210                }
211            }
212            ScopeHandling::PreserveCase => name.to_string(),
213        }
214    }
215
216    /// Collapse repeated separators (hyphens, underscores, dots)
217    fn collapse_separators(&self, name: &str) -> String {
218        let mut result = String::with_capacity(name.len());
219        let mut last_was_sep = false;
220
221        for c in name.chars() {
222            let is_sep = c == '-' || c == '_' || c == '.';
223            if is_sep {
224                if !last_was_sep {
225                    result.push(c);
226                }
227                last_was_sep = true;
228            } else {
229                result.push(c);
230                last_was_sep = false;
231            }
232        }
233
234        // Trim separators from ends
235        result
236            .trim_matches(|c| c == '-' || c == '_' || c == '.')
237            .to_string()
238    }
239
240    /// Strip Go module version suffix (/v2, /v3, etc.)
241    fn strip_go_version_suffix(&self, name: &str) -> String {
242        use std::sync::LazyLock;
243        static GO_VERSION_SUFFIX: LazyLock<Regex> =
244            LazyLock::new(|| Regex::new(r"/v\d+$").expect("static regex"));
245        GO_VERSION_SUFFIX.replace(name, "").to_string()
246    }
247
248    /// Check if two names match according to ecosystem rules
249    pub fn names_match(&self, name_a: &str, name_b: &str, ecosystem: &Ecosystem) -> bool {
250        let norm_a = self.normalize_name(name_a, ecosystem);
251        let norm_b = self.normalize_name(name_b, ecosystem);
252        norm_a == norm_b
253    }
254
255    /// Get the canonical name for an alias
256    pub fn get_canonical(&self, name: &str, ecosystem: &Ecosystem) -> Option<String> {
257        let eco_key = Self::ecosystem_key(ecosystem);
258        let name_lower = name.to_lowercase();
259
260        if let Some(eco_config) = self.config.ecosystems.get(&eco_key) {
261            for (canonical, aliases) in &eco_config.aliases {
262                if canonical.to_lowercase() == name_lower {
263                    return Some(canonical.clone());
264                }
265                for alias in aliases {
266                    if alias.to_lowercase() == name_lower {
267                        return Some(canonical.clone());
268                    }
269                }
270            }
271        }
272
273        // Check custom equivalences
274        for equiv in &self.config.custom_rules.equivalences {
275            if equiv.canonical.to_lowercase() == name_lower {
276                return Some(equiv.canonical.clone());
277            }
278            for alias in &equiv.aliases {
279                if alias.to_lowercase() == name_lower {
280                    return Some(equiv.canonical.clone());
281                }
282            }
283        }
284
285        None
286    }
287
288    /// Check if a name is an alias of a canonical name
289    pub fn is_alias(&self, canonical: &str, name: &str, ecosystem: &Ecosystem) -> bool {
290        let eco_key = Self::ecosystem_key(ecosystem);
291        let name_lower = name.to_lowercase();
292        let canonical_lower = canonical.to_lowercase();
293
294        if let Some(eco_config) = self.config.ecosystems.get(&eco_key) {
295            if let Some(aliases) = eco_config.aliases.get(&canonical_lower) {
296                return aliases.iter().any(|a| a.to_lowercase() == name_lower);
297            }
298        }
299
300        false
301    }
302
303    /// Get common suffixes to strip for a given ecosystem
304    pub fn get_strip_suffixes(&self, ecosystem: &Ecosystem) -> Vec<&str> {
305        let eco_key = Self::ecosystem_key(ecosystem);
306
307        self.config
308            .ecosystems
309            .get(&eco_key)
310            .map(|c| c.strip_suffixes.iter().map(|s| s.as_str()).collect())
311            .unwrap_or_default()
312    }
313
314    /// Get common prefixes to strip for a given ecosystem
315    pub fn get_strip_prefixes(&self, ecosystem: &Ecosystem) -> Vec<&str> {
316        let eco_key = Self::ecosystem_key(ecosystem);
317
318        self.config
319            .ecosystems
320            .get(&eco_key)
321            .map(|c| c.strip_prefixes.iter().map(|s| s.as_str()).collect())
322            .unwrap_or_default()
323    }
324
325    /// Normalize name by stripping common prefixes/suffixes
326    pub fn strip_affixes(&self, name: &str, ecosystem: &Ecosystem) -> String {
327        let mut result = name.to_lowercase();
328
329        for prefix in self.get_strip_prefixes(ecosystem) {
330            if result.starts_with(prefix) {
331                result = result[prefix.len()..].to_string();
332                break;
333            }
334        }
335
336        for suffix in self.get_strip_suffixes(ecosystem) {
337            if result.ends_with(suffix) {
338                result = result[..result.len() - suffix.len()].to_string();
339                break;
340            }
341        }
342
343        result
344    }
345
346    /// Check if a package name is a known typosquat
347    pub fn is_typosquat(&self, name: &str, ecosystem: &Ecosystem) -> Option<&TyposquatEntry> {
348        if !self.config.settings.enable_security_checks {
349            return None;
350        }
351
352        let eco_key = Self::ecosystem_key(ecosystem);
353        let name_lower = name.to_lowercase();
354
355        if let Some(eco_config) = self.config.ecosystems.get(&eco_key) {
356            for entry in &eco_config.security.known_typosquats {
357                if entry.malicious.to_lowercase() == name_lower {
358                    return Some(entry);
359                }
360            }
361        }
362
363        None
364    }
365
366    /// Check if a package name matches suspicious patterns
367    pub fn is_suspicious(&self, name: &str, ecosystem: &Ecosystem) -> bool {
368        if !self.config.settings.enable_security_checks {
369            return false;
370        }
371
372        let eco_key = Self::ecosystem_key(ecosystem);
373
374        if let Some(patterns) = self.suspicious_patterns.get(&eco_key) {
375            patterns.iter().any(|re| re.is_match(name))
376        } else {
377            false
378        }
379    }
380
381    /// Check if a package is a known malicious package
382    pub fn is_known_malicious(&self, name: &str, ecosystem: &Ecosystem) -> bool {
383        if !self.config.settings.enable_security_checks {
384            return false;
385        }
386
387        let eco_key = Self::ecosystem_key(ecosystem);
388        let name_lower = name.to_lowercase();
389
390        if let Some(eco_config) = self.config.ecosystems.get(&eco_key) {
391            eco_config
392                .security
393                .known_malicious
394                .iter()
395                .any(|m| m.to_lowercase() == name_lower)
396        } else {
397            false
398        }
399    }
400
401    /// Get the migrated group ID (for Maven javax -> jakarta, etc.)
402    pub fn get_migrated_group(&self, group: &str, ecosystem: &Ecosystem) -> Option<String> {
403        let eco_key = Self::ecosystem_key(ecosystem);
404
405        if let Some(patterns) = self.migration_patterns.get(&eco_key) {
406            for (pattern, replacement) in patterns {
407                if pattern.is_match(group) {
408                    let migrated = pattern.replace(group, replacement.as_str());
409                    return Some(migrated.to_string());
410                }
411            }
412        }
413
414        None
415    }
416
417    /// Check if a package is part of a package group
418    pub fn get_package_group(&self, name: &str, ecosystem: &Ecosystem) -> Option<&str> {
419        let eco_key = Self::ecosystem_key(ecosystem);
420        let name_lower = name.to_lowercase();
421
422        if let Some(eco_config) = self.config.ecosystems.get(&eco_key) {
423            // Get pre-compiled patterns for this ecosystem (if any)
424            let compiled_patterns = self.package_group_patterns.get(&eco_key);
425
426            for (group_name, group) in &eco_config.package_groups {
427                // Check canonical
428                if group.canonical.to_lowercase() == name_lower {
429                    return Some(group_name);
430                }
431
432                // Check members using pre-compiled patterns for globs
433                for member in &group.members {
434                    if member.contains('*') {
435                        // Use pre-compiled pattern
436                        if let Some(group_patterns) = compiled_patterns {
437                            if let Some(patterns) = group_patterns.get(group_name) {
438                                if patterns.iter().any(|re| re.is_match(&name_lower)) {
439                                    return Some(group_name);
440                                }
441                            }
442                        }
443                    } else if member.to_lowercase() == name_lower {
444                        return Some(group_name);
445                    }
446                }
447            }
448        }
449
450        None
451    }
452
453    /// Get cross-ecosystem equivalent package
454    pub fn get_cross_ecosystem_equivalent(
455        &self,
456        concept: &str,
457        target_ecosystem: &Ecosystem,
458    ) -> Option<&str> {
459        let eco_key = Self::ecosystem_key(target_ecosystem);
460
461        self.config
462            .cross_ecosystem
463            .get(concept)
464            .and_then(|mapping| mapping.get(&eco_key))
465            .and_then(|opt| opt.as_deref())
466    }
467
468    /// Check if a package is an internal/organization package
469    pub fn is_internal_package(&self, name: &str) -> bool {
470        self.config
471            .custom_rules
472            .internal_prefixes
473            .iter()
474            .any(|prefix| name.starts_with(prefix))
475    }
476
477    /// Check if a package should be ignored in diffs
478    pub fn is_ignored(&self, name: &str) -> bool {
479        let name_lower = name.to_lowercase();
480        self.config
481            .custom_rules
482            .ignored_packages
483            .iter()
484            .any(|p| p.to_lowercase() == name_lower)
485    }
486
487    /// Convert Ecosystem enum to string key
488    fn ecosystem_key(ecosystem: &Ecosystem) -> String {
489        match ecosystem {
490            Ecosystem::Npm => "npm".to_string(),
491            Ecosystem::PyPi => "pypi".to_string(),
492            Ecosystem::Cargo => "cargo".to_string(),
493            Ecosystem::Maven => "maven".to_string(),
494            Ecosystem::Golang => "golang".to_string(),
495            Ecosystem::Nuget => "nuget".to_string(),
496            Ecosystem::RubyGems => "rubygems".to_string(),
497            Ecosystem::Composer => "composer".to_string(),
498            Ecosystem::CocoaPods => "cocoapods".to_string(),
499            Ecosystem::Swift => "swift".to_string(),
500            Ecosystem::Hex => "hex".to_string(),
501            Ecosystem::Pub => "pub".to_string(),
502            Ecosystem::Hackage => "hackage".to_string(),
503            Ecosystem::Cpan => "cpan".to_string(),
504            Ecosystem::Cran => "cran".to_string(),
505            Ecosystem::Conda => "conda".to_string(),
506            Ecosystem::Conan => "conan".to_string(),
507            Ecosystem::Deb => "deb".to_string(),
508            Ecosystem::Rpm => "rpm".to_string(),
509            Ecosystem::Apk => "apk".to_string(),
510            Ecosystem::Generic => "generic".to_string(),
511            Ecosystem::Unknown(s) => s.to_lowercase(),
512        }
513    }
514}
515
516impl Default for EcosystemRules {
517    fn default() -> Self {
518        Self::new()
519    }
520}
521
#[cfg(test)]
mod tests {
    use super::*;

    /// Rules built from the built-in configuration, shared by most tests.
    fn builtin_rules() -> EcosystemRules {
        EcosystemRules::new()
    }

    /// PyPI treats `-`, `_` and `.` as equivalent and ignores case.
    #[test]
    fn test_pypi_normalization() {
        let rules = builtin_rules();

        for raw in ["python-dateutil", "python_dateutil", "Python.Dateutil"] {
            assert_eq!(
                rules.normalize_name(raw, &Ecosystem::PyPi),
                "python-dateutil"
            );
        }
    }

    /// Cargo normalizes hyphens to underscores.
    #[test]
    fn test_cargo_normalization() {
        let rules = builtin_rules();

        for raw in ["serde-json", "serde_json"] {
            assert_eq!(rules.normalize_name(raw, &Ecosystem::Cargo), "serde_json");
        }
    }

    /// Scoped npm names are lowercased, scope included.
    #[test]
    fn test_npm_scoped_normalization() {
        let rules = builtin_rules();

        let normalized = rules.normalize_name("@Angular/Core", &Ecosystem::Npm);
        assert_eq!(normalized, "@angular/core");
    }

    /// Names that normalize to the same string are considered a match.
    #[test]
    fn test_names_match() {
        let rules = builtin_rules();
        let cases = [
            ("python-dateutil", "python_dateutil", Ecosystem::PyPi),
            ("serde-json", "serde_json", Ecosystem::Cargo),
        ];

        for (left, right, ecosystem) in cases {
            assert!(rules.names_match(left, right, &ecosystem));
        }
    }

    /// Configured prefixes and suffixes are removed from names.
    #[test]
    fn test_strip_affixes() {
        let rules = builtin_rules();
        let cases = [
            ("python-requests", Ecosystem::PyPi, "requests"),
            ("lodash-js", Ecosystem::Npm, "lodash"),
        ];

        for (raw, ecosystem, expected) in cases {
            assert_eq!(rules.strip_affixes(raw, &ecosystem), expected);
        }
    }

    /// Known typosquats resolve to their legitimate package; clean names do not.
    #[test]
    fn test_typosquat_detection() {
        let rules = builtin_rules();

        let squat = rules.is_typosquat("python-dateutils", &Ecosystem::PyPi);
        assert!(squat.is_some());
        assert_eq!(squat.unwrap().legitimate, "python-dateutil");

        let clean = rules.is_typosquat("requests", &Ecosystem::PyPi);
        assert!(clean.is_none());
    }

    /// Both the canonical name and a member resolve to the same group.
    #[test]
    fn test_package_group() {
        let rules = builtin_rules();

        for member in ["lodash-es", "lodash"] {
            assert_eq!(
                rules.get_package_group(member, &Ecosystem::Npm),
                Some("lodash")
            );
        }
    }

    /// A concept maps to a different package in each ecosystem.
    #[test]
    fn test_cross_ecosystem() {
        let rules = builtin_rules();
        let cases = [(Ecosystem::PyPi, "pyyaml"), (Ecosystem::Npm, "js-yaml")];

        for (ecosystem, expected) in cases {
            assert_eq!(
                rules.get_cross_ecosystem_equivalent("yaml_parsing", &ecosystem),
                Some(expected)
            );
        }
    }

    /// Go module paths lose a trailing `/vN`; other paths are untouched.
    #[test]
    fn test_go_version_suffix() {
        let rules = builtin_rules();

        for raw in ["github.com/foo/bar/v2", "github.com/foo/bar"] {
            assert_eq!(
                rules.normalize_name(raw, &Ecosystem::Golang),
                "github.com/foo/bar"
            );
        }
    }

    /// Aliases resolve to their canonical package name, ignoring case.
    #[test]
    fn test_canonical_lookup() {
        let rules = builtin_rules();
        let cases = [("PIL", "pillow"), ("sklearn", "scikit-learn")];

        for (alias, canonical) in cases {
            assert_eq!(
                rules.get_canonical(alias, &Ecosystem::PyPi),
                Some(canonical.to_string())
            );
        }
    }

    /// Custom rules loaded from YAML drive the internal/ignored checks.
    #[test]
    fn test_custom_config() {
        let yaml = r#"
version: "1.0"
custom_rules:
  internal_prefixes:
    - "@mycompany/"
  ignored_packages:
    - "internal-tool"
"#;
        let parsed = EcosystemRulesConfig::from_yaml(yaml).unwrap();
        let rules = EcosystemRules::with_config(parsed);

        assert!(rules.is_internal_package("@mycompany/logger"));
        assert!(!rules.is_internal_package("lodash"));
        assert!(rules.is_ignored("internal-tool"));
    }
}