Skip to main content

sbom_tools/matching/
rules.rs

1//! Ecosystem-specific matching rules.
2//!
3//! Provides configurable rules for normalizing and matching package names
4//! according to each ecosystem's conventions.
5
6use crate::model::Ecosystem;
7use regex::Regex;
8use std::collections::HashMap;
9
10use super::ecosystem_config::{
11    ConfigError, EcosystemConfig, EcosystemRulesConfig, NormalizationConfig, ScopeHandling,
12    TyposquatEntry,
13};
14
15/// Ecosystem-specific normalization and matching rules.
16pub struct EcosystemRules {
17    /// Configuration
18    config: EcosystemRulesConfig,
19    /// Compiled regex patterns for suspicious package detection
20    suspicious_patterns: HashMap<String, Vec<Regex>>,
21    /// Compiled regex patterns for group migrations
22    migration_patterns: HashMap<String, Vec<(Regex, String)>>,
23    /// Compiled regex patterns for package group glob members
24    /// Key: ecosystem -> `group_name` -> Vec<Regex>
25    package_group_patterns: HashMap<String, HashMap<String, Vec<Regex>>>,
26}
27
28impl EcosystemRules {
29    /// Create a new ecosystem rules instance with built-in defaults
30    #[must_use]
31    pub fn new() -> Self {
32        Self::with_config(EcosystemRulesConfig::builtin())
33    }
34
35    /// Create ecosystem rules with custom configuration
36    #[must_use]
37    pub fn with_config(config: EcosystemRulesConfig) -> Self {
38        let suspicious_patterns = Self::compile_suspicious_patterns(&config);
39        let migration_patterns = Self::compile_migration_patterns(&config);
40        let package_group_patterns = Self::compile_package_group_patterns(&config);
41
42        Self {
43            config,
44            suspicious_patterns,
45            migration_patterns,
46            package_group_patterns,
47        }
48    }
49
50    /// Load configuration from a file
51    pub fn from_file(path: &std::path::Path) -> Result<Self, ConfigError> {
52        let config = EcosystemRulesConfig::from_file(path)?;
53        Ok(Self::with_config(config))
54    }
55
56    /// Load configuration from default locations with precedence
57    #[must_use]
58    pub fn from_default_locations() -> Self {
59        let config = EcosystemRulesConfig::load_with_precedence(&[
60            ".sbom-tools/ecosystem-rules.yaml",
61            ".sbom-tools/ecosystem-rules.json",
62            "~/.config/sbom-tools/ecosystem-rules.yaml",
63            "~/.config/sbom-tools/ecosystem-rules.json",
64        ])
65        .unwrap_or_else(|_| EcosystemRulesConfig::builtin());
66
67        Self::with_config(config)
68    }
69
70    /// Compile suspicious package name patterns
71    fn compile_suspicious_patterns(config: &EcosystemRulesConfig) -> HashMap<String, Vec<Regex>> {
72        let mut patterns = HashMap::with_capacity(config.ecosystems.len());
73
74        for (ecosystem, eco_config) in &config.ecosystems {
75            let mut compiled = Vec::with_capacity(eco_config.security.suspicious_patterns.len());
76            for pattern in &eco_config.security.suspicious_patterns {
77                if let Ok(re) = Regex::new(pattern) {
78                    compiled.push(re);
79                }
80            }
81            if !compiled.is_empty() {
82                patterns.insert(ecosystem.clone(), compiled);
83            }
84        }
85
86        patterns
87    }
88
89    /// Compile group migration patterns
90    fn compile_migration_patterns(
91        config: &EcosystemRulesConfig,
92    ) -> HashMap<String, Vec<(Regex, String)>> {
93        let mut patterns = HashMap::with_capacity(config.ecosystems.len());
94
95        for (ecosystem, eco_config) in &config.ecosystems {
96            let mut compiled = Vec::with_capacity(eco_config.group_migrations.len());
97            for migration in &eco_config.group_migrations {
98                // Convert glob pattern to regex
99                let regex_pattern = migration.from.replace('.', r"\.").replace('*', ".*");
100                if let Ok(re) = Regex::new(&format!("^{regex_pattern}$")) {
101                    compiled.push((re, migration.to.clone()));
102                }
103            }
104            if !compiled.is_empty() {
105                patterns.insert(ecosystem.clone(), compiled);
106            }
107        }
108
109        patterns
110    }
111
112    /// Compile package group glob patterns for efficient matching
113    fn compile_package_group_patterns(
114        config: &EcosystemRulesConfig,
115    ) -> HashMap<String, HashMap<String, Vec<Regex>>> {
116        let mut eco_patterns = HashMap::with_capacity(config.ecosystems.len());
117
118        for (ecosystem, eco_config) in &config.ecosystems {
119            let mut group_patterns = HashMap::with_capacity(eco_config.package_groups.len());
120
121            for (group_name, group) in &eco_config.package_groups {
122                // Count glob patterns to pre-allocate
123                let glob_count = group.members.iter().filter(|m| m.contains('*')).count();
124                let mut compiled = Vec::with_capacity(glob_count);
125                for member in &group.members {
126                    if member.contains('*') {
127                        // Convert glob pattern to regex
128                        let regex_pattern = member.replace('.', r"\.").replace('*', ".*");
129                        if let Ok(re) = Regex::new(&format!("^{regex_pattern}$")) {
130                            compiled.push(re);
131                        }
132                    }
133                }
134                if !compiled.is_empty() {
135                    group_patterns.insert(group_name.clone(), compiled);
136                }
137            }
138
139            if !group_patterns.is_empty() {
140                eco_patterns.insert(ecosystem.clone(), group_patterns);
141            }
142        }
143
144        eco_patterns
145    }
146
147    /// Get the underlying configuration
148    #[must_use]
149    pub const fn config(&self) -> &EcosystemRulesConfig {
150        &self.config
151    }
152
153    /// Normalize a package name according to ecosystem rules
154    #[must_use]
155    pub fn normalize_name(&self, name: &str, ecosystem: &Ecosystem) -> String {
156        let eco_key = Self::ecosystem_key(ecosystem);
157
158        self.config.ecosystems.get(&eco_key).map_or_else(
159            || {
160                // Fallback to basic normalization
161                name.to_lowercase()
162            },
163            |eco_config| self.apply_normalization(name, eco_config),
164        )
165    }
166
167    /// Apply normalization rules from config
168    fn apply_normalization(&self, name: &str, config: &EcosystemConfig) -> String {
169        let norm = &config.normalization;
170        let mut result = name.to_string();
171
172        // Handle scoped packages (npm @scope/name)
173        if result.starts_with('@') {
174            result = self.normalize_scoped_name(&result, norm);
175        } else {
176            // Apply case sensitivity
177            if !norm.case_sensitive {
178                result = result.to_lowercase();
179            }
180        }
181
182        // Apply character equivalence
183        for char_group in &norm.equivalent_chars {
184            if char_group.len() >= 2 {
185                let target = &char_group[0];
186                for source in &char_group[1..] {
187                    result = result.replace(source.as_str(), target);
188                }
189            }
190        }
191
192        // Collapse separators if enabled
193        if norm.collapse_separators {
194            result = self.collapse_separators(&result);
195        }
196
197        // Strip version suffix for Go modules
198        if norm.strip_version_suffix {
199            result = self.strip_go_version_suffix(&result);
200        }
201
202        result
203    }
204
205    /// Normalize scoped package name (npm @scope/name)
206    fn normalize_scoped_name(&self, name: &str, norm: &NormalizationConfig) -> String {
207        match norm.scope_handling {
208            ScopeHandling::Lowercase => name.to_lowercase(),
209            ScopeHandling::PreserveScopeCase => name.find('/').map_or_else(
210                || name.to_lowercase(),
211                |slash_pos| {
212                    let scope = &name[..slash_pos];
213                    let pkg_name = &name[slash_pos + 1..];
214                    format!("{}/{}", scope.to_lowercase(), pkg_name.to_lowercase())
215                },
216            ),
217            ScopeHandling::PreserveCase => name.to_string(),
218        }
219    }
220
221    /// Collapse repeated separators (hyphens, underscores, dots)
222    fn collapse_separators(&self, name: &str) -> String {
223        let mut result = String::with_capacity(name.len());
224        let mut last_was_sep = false;
225
226        for c in name.chars() {
227            let is_sep = c == '-' || c == '_' || c == '.';
228            if is_sep {
229                if !last_was_sep {
230                    result.push(c);
231                }
232                last_was_sep = true;
233            } else {
234                result.push(c);
235                last_was_sep = false;
236            }
237        }
238
239        // Trim separators from ends
240        result
241            .trim_matches(|c| c == '-' || c == '_' || c == '.')
242            .to_string()
243    }
244
245    /// Strip Go module version suffix (/v2, /v3, etc.)
246    fn strip_go_version_suffix(&self, name: &str) -> String {
247        use std::sync::LazyLock;
248        static GO_VERSION_SUFFIX: LazyLock<Regex> =
249            LazyLock::new(|| Regex::new(r"/v\d+$").expect("static regex"));
250        GO_VERSION_SUFFIX.replace(name, "").to_string()
251    }
252
253    /// Check if two names match according to ecosystem rules
254    #[must_use]
255    pub fn names_match(&self, name_a: &str, name_b: &str, ecosystem: &Ecosystem) -> bool {
256        let norm_a = self.normalize_name(name_a, ecosystem);
257        let norm_b = self.normalize_name(name_b, ecosystem);
258        norm_a == norm_b
259    }
260
261    /// Get the canonical name for an alias
262    #[must_use]
263    pub fn get_canonical(&self, name: &str, ecosystem: &Ecosystem) -> Option<String> {
264        let eco_key = Self::ecosystem_key(ecosystem);
265        let name_lower = name.to_lowercase();
266
267        if let Some(eco_config) = self.config.ecosystems.get(&eco_key) {
268            for (canonical, aliases) in &eco_config.aliases {
269                if canonical.to_lowercase() == name_lower {
270                    return Some(canonical.clone());
271                }
272                for alias in aliases {
273                    if alias.to_lowercase() == name_lower {
274                        return Some(canonical.clone());
275                    }
276                }
277            }
278        }
279
280        // Check custom equivalences
281        for equiv in &self.config.custom_rules.equivalences {
282            if equiv.canonical.to_lowercase() == name_lower {
283                return Some(equiv.canonical.clone());
284            }
285            for alias in &equiv.aliases {
286                if alias.to_lowercase() == name_lower {
287                    return Some(equiv.canonical.clone());
288                }
289            }
290        }
291
292        None
293    }
294
295    /// Check if a name is an alias of a canonical name
296    #[must_use]
297    pub fn is_alias(&self, canonical: &str, name: &str, ecosystem: &Ecosystem) -> bool {
298        let eco_key = Self::ecosystem_key(ecosystem);
299        let name_lower = name.to_lowercase();
300        let canonical_lower = canonical.to_lowercase();
301
302        if let Some(eco_config) = self.config.ecosystems.get(&eco_key)
303            && let Some(aliases) = eco_config.aliases.get(&canonical_lower)
304        {
305            return aliases.iter().any(|a| a.to_lowercase() == name_lower);
306        }
307
308        false
309    }
310
311    /// Get common suffixes to strip for a given ecosystem
312    #[must_use]
313    pub fn get_strip_suffixes(&self, ecosystem: &Ecosystem) -> Vec<&str> {
314        let eco_key = Self::ecosystem_key(ecosystem);
315
316        self.config
317            .ecosystems
318            .get(&eco_key)
319            .map(|c| {
320                c.strip_suffixes
321                    .iter()
322                    .map(std::string::String::as_str)
323                    .collect()
324            })
325            .unwrap_or_default()
326    }
327
328    /// Get common prefixes to strip for a given ecosystem
329    #[must_use]
330    pub fn get_strip_prefixes(&self, ecosystem: &Ecosystem) -> Vec<&str> {
331        let eco_key = Self::ecosystem_key(ecosystem);
332
333        self.config
334            .ecosystems
335            .get(&eco_key)
336            .map(|c| {
337                c.strip_prefixes
338                    .iter()
339                    .map(std::string::String::as_str)
340                    .collect()
341            })
342            .unwrap_or_default()
343    }
344
345    /// Normalize name by stripping common prefixes/suffixes
346    #[must_use]
347    pub fn strip_affixes(&self, name: &str, ecosystem: &Ecosystem) -> String {
348        let mut result = name.to_lowercase();
349
350        for prefix in self.get_strip_prefixes(ecosystem) {
351            if result.starts_with(prefix) {
352                result = result[prefix.len()..].to_string();
353                break;
354            }
355        }
356
357        for suffix in self.get_strip_suffixes(ecosystem) {
358            if result.ends_with(suffix) {
359                result = result[..result.len() - suffix.len()].to_string();
360                break;
361            }
362        }
363
364        result
365    }
366
367    /// Check if a package name is a known typosquat
368    #[must_use]
369    pub fn is_typosquat(&self, name: &str, ecosystem: &Ecosystem) -> Option<&TyposquatEntry> {
370        if !self.config.settings.enable_security_checks {
371            return None;
372        }
373
374        let eco_key = Self::ecosystem_key(ecosystem);
375        let name_lower = name.to_lowercase();
376
377        if let Some(eco_config) = self.config.ecosystems.get(&eco_key) {
378            for entry in &eco_config.security.known_typosquats {
379                if entry.malicious.to_lowercase() == name_lower {
380                    return Some(entry);
381                }
382            }
383        }
384
385        None
386    }
387
388    /// Check if a package name matches suspicious patterns
389    #[must_use]
390    pub fn is_suspicious(&self, name: &str, ecosystem: &Ecosystem) -> bool {
391        if !self.config.settings.enable_security_checks {
392            return false;
393        }
394
395        let eco_key = Self::ecosystem_key(ecosystem);
396
397        self.suspicious_patterns
398            .get(&eco_key)
399            .is_some_and(|patterns| patterns.iter().any(|re| re.is_match(name)))
400    }
401
402    /// Check if a package is a known malicious package
403    #[must_use]
404    pub fn is_known_malicious(&self, name: &str, ecosystem: &Ecosystem) -> bool {
405        if !self.config.settings.enable_security_checks {
406            return false;
407        }
408
409        let eco_key = Self::ecosystem_key(ecosystem);
410        let name_lower = name.to_lowercase();
411
412        self.config
413            .ecosystems
414            .get(&eco_key)
415            .is_some_and(|eco_config| {
416                eco_config
417                    .security
418                    .known_malicious
419                    .iter()
420                    .any(|m| m.to_lowercase() == name_lower)
421            })
422    }
423
424    /// Get the migrated group ID (for Maven javax -> jakarta, etc.)
425    #[must_use]
426    pub fn get_migrated_group(&self, group: &str, ecosystem: &Ecosystem) -> Option<String> {
427        let eco_key = Self::ecosystem_key(ecosystem);
428
429        if let Some(patterns) = self.migration_patterns.get(&eco_key) {
430            for (pattern, replacement) in patterns {
431                if pattern.is_match(group) {
432                    let migrated = pattern.replace(group, replacement.as_str());
433                    return Some(migrated.to_string());
434                }
435            }
436        }
437
438        None
439    }
440
441    /// Check if a package is part of a package group
442    #[must_use]
443    pub fn get_package_group(&self, name: &str, ecosystem: &Ecosystem) -> Option<&str> {
444        let eco_key = Self::ecosystem_key(ecosystem);
445        let name_lower = name.to_lowercase();
446
447        if let Some(eco_config) = self.config.ecosystems.get(&eco_key) {
448            // Get pre-compiled patterns for this ecosystem (if any)
449            let compiled_patterns = self.package_group_patterns.get(&eco_key);
450
451            for (group_name, group) in &eco_config.package_groups {
452                // Check canonical
453                if group.canonical.to_lowercase() == name_lower {
454                    return Some(group_name);
455                }
456
457                // Check members using pre-compiled patterns for globs
458                for member in &group.members {
459                    if member.contains('*') {
460                        // Use pre-compiled pattern
461                        if let Some(group_patterns) = compiled_patterns
462                            && let Some(patterns) = group_patterns.get(group_name)
463                            && patterns.iter().any(|re| re.is_match(&name_lower))
464                        {
465                            return Some(group_name);
466                        }
467                    } else if member.to_lowercase() == name_lower {
468                        return Some(group_name);
469                    }
470                }
471            }
472        }
473
474        None
475    }
476
477    /// Get cross-ecosystem equivalent package
478    #[must_use]
479    pub fn get_cross_ecosystem_equivalent(
480        &self,
481        concept: &str,
482        target_ecosystem: &Ecosystem,
483    ) -> Option<&str> {
484        let eco_key = Self::ecosystem_key(target_ecosystem);
485
486        self.config
487            .cross_ecosystem
488            .get(concept)
489            .and_then(|mapping| mapping.get(&eco_key))
490            .and_then(|opt| opt.as_deref())
491    }
492
493    /// Check if a package is an internal/organization package
494    #[must_use]
495    pub fn is_internal_package(&self, name: &str) -> bool {
496        self.config
497            .custom_rules
498            .internal_prefixes
499            .iter()
500            .any(|prefix| name.starts_with(prefix))
501    }
502
503    /// Check if a package should be ignored in diffs
504    #[must_use]
505    pub fn is_ignored(&self, name: &str) -> bool {
506        let name_lower = name.to_lowercase();
507        self.config
508            .custom_rules
509            .ignored_packages
510            .iter()
511            .any(|p| p.to_lowercase() == name_lower)
512    }
513
514    /// Convert Ecosystem enum to string key
515    fn ecosystem_key(ecosystem: &Ecosystem) -> String {
516        match ecosystem {
517            Ecosystem::Npm => "npm".to_string(),
518            Ecosystem::PyPi => "pypi".to_string(),
519            Ecosystem::Cargo => "cargo".to_string(),
520            Ecosystem::Maven => "maven".to_string(),
521            Ecosystem::Golang => "golang".to_string(),
522            Ecosystem::Nuget => "nuget".to_string(),
523            Ecosystem::RubyGems => "rubygems".to_string(),
524            Ecosystem::Composer => "composer".to_string(),
525            Ecosystem::CocoaPods => "cocoapods".to_string(),
526            Ecosystem::Swift => "swift".to_string(),
527            Ecosystem::Hex => "hex".to_string(),
528            Ecosystem::Pub => "pub".to_string(),
529            Ecosystem::Hackage => "hackage".to_string(),
530            Ecosystem::Cpan => "cpan".to_string(),
531            Ecosystem::Cran => "cran".to_string(),
532            Ecosystem::Conda => "conda".to_string(),
533            Ecosystem::Conan => "conan".to_string(),
534            Ecosystem::Deb => "deb".to_string(),
535            Ecosystem::Rpm => "rpm".to_string(),
536            Ecosystem::Apk => "apk".to_string(),
537            Ecosystem::Generic => "generic".to_string(),
538            Ecosystem::Unknown(s) => s.to_lowercase(),
539        }
540    }
541}
542
543impl Default for EcosystemRules {
544    fn default() -> Self {
545        Self::new()
546    }
547}
548
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that every `(input, expected)` pair normalizes as expected under
    /// the built-in rules for `ecosystem`.
    fn assert_normalizes(ecosystem: &Ecosystem, cases: &[(&str, &str)]) {
        let rules = EcosystemRules::new();
        for (input, expected) in cases {
            assert_eq!(rules.normalize_name(input, ecosystem), *expected);
        }
    }

    /// PyPI treats `-`, `_` and `.` as equivalent and ignores case.
    #[test]
    fn test_pypi_normalization() {
        assert_normalizes(
            &Ecosystem::PyPi,
            &[
                ("python-dateutil", "python-dateutil"),
                ("python_dateutil", "python-dateutil"),
                ("Python.Dateutil", "python-dateutil"),
            ],
        );
    }

    /// Cargo treats `-` and `_` as equivalent, normalizing to `_`.
    #[test]
    fn test_cargo_normalization() {
        assert_normalizes(
            &Ecosystem::Cargo,
            &[("serde-json", "serde_json"), ("serde_json", "serde_json")],
        );
    }

    /// Scoped npm names are lowercased in both scope and package part.
    #[test]
    fn test_npm_scoped_normalization() {
        assert_normalizes(&Ecosystem::Npm, &[("@Angular/Core", "@angular/core")]);
    }

    /// Names that normalize to the same string are considered a match.
    #[test]
    fn test_names_match() {
        let rules = EcosystemRules::new();

        let pairs = [
            ("python-dateutil", "python_dateutil", Ecosystem::PyPi),
            ("serde-json", "serde_json", Ecosystem::Cargo),
        ];
        for (left, right, ecosystem) in &pairs {
            assert!(rules.names_match(left, right, ecosystem));
        }
    }

    /// Built-in prefixes and suffixes are stripped per ecosystem.
    #[test]
    fn test_strip_affixes() {
        let rules = EcosystemRules::new();

        let stripped_pypi = rules.strip_affixes("python-requests", &Ecosystem::PyPi);
        assert_eq!(stripped_pypi, "requests");

        let stripped_npm = rules.strip_affixes("lodash-js", &Ecosystem::Npm);
        assert_eq!(stripped_npm, "lodash");
    }

    /// A known typosquat resolves to its legitimate package; a legitimate
    /// package is not flagged.
    #[test]
    fn test_typosquat_detection() {
        let rules = EcosystemRules::new();

        let hit = rules.is_typosquat("python-dateutils", &Ecosystem::PyPi);
        assert!(hit.is_some());
        assert_eq!(hit.unwrap().legitimate, "python-dateutil");

        let miss = rules.is_typosquat("requests", &Ecosystem::PyPi);
        assert!(miss.is_none());
    }

    /// Both a group member and the group's canonical name resolve to the group.
    #[test]
    fn test_package_group() {
        let rules = EcosystemRules::new();

        for member in ["lodash-es", "lodash"] {
            assert_eq!(
                rules.get_package_group(member, &Ecosystem::Npm),
                Some("lodash")
            );
        }
    }

    /// A concept maps to a different package in each ecosystem.
    #[test]
    fn test_cross_ecosystem() {
        let rules = EcosystemRules::new();

        let in_pypi = rules.get_cross_ecosystem_equivalent("yaml_parsing", &Ecosystem::PyPi);
        assert_eq!(in_pypi, Some("pyyaml"));

        let in_npm = rules.get_cross_ecosystem_equivalent("yaml_parsing", &Ecosystem::Npm);
        assert_eq!(in_npm, Some("js-yaml"));
    }

    /// A trailing Go major-version segment is removed; other paths are kept.
    #[test]
    fn test_go_version_suffix() {
        assert_normalizes(
            &Ecosystem::Golang,
            &[
                ("github.com/foo/bar/v2", "github.com/foo/bar"),
                ("github.com/foo/bar", "github.com/foo/bar"),
            ],
        );
    }

    /// Aliases resolve to their canonical name regardless of input case.
    #[test]
    fn test_canonical_lookup() {
        let rules = EcosystemRules::new();

        let cases = [("PIL", "pillow"), ("sklearn", "scikit-learn")];
        for (alias, canonical) in cases {
            assert_eq!(
                rules.get_canonical(alias, &Ecosystem::PyPi),
                Some(canonical.to_string())
            );
        }
    }

    /// Custom rules loaded from YAML drive the internal/ignored checks.
    #[test]
    fn test_custom_config() {
        let yaml = r#"
version: "1.0"
custom_rules:
  internal_prefixes:
    - "@mycompany/"
  ignored_packages:
    - "internal-tool"
"#;
        let rules = EcosystemRules::with_config(EcosystemRulesConfig::from_yaml(yaml).unwrap());

        assert!(rules.is_internal_package("@mycompany/logger"));
        assert!(!rules.is_internal_package("lodash"));
        assert!(rules.is_ignored("internal-tool"));
    }
}