sbom_tools/matching/
mod.rs

1//! Fuzzy matching engine for cross-ecosystem package correlation.
2//!
3//! This module provides multi-tier matching strategies for correlating
4//! components across different ecosystems and naming conventions.
5//!
6//! # Architecture
7//!
8//! The matching system is built on the [`ComponentMatcher`] trait, which
9//! provides a pluggable interface for different matching strategies:
10//!
11//! - [`FuzzyMatcher`]: Multi-tier fuzzy matching (default)
12//! - [`CompositeMatcher`]: Combines multiple matchers
13//! - [`CachedMatcher`]: Wraps any matcher with caching
14//!
15//! # Example
16//!
17//! ```ignore
18//! use sbom_tools::matching::{ComponentMatcher, FuzzyMatcher, FuzzyMatchConfig};
19//!
20//! // Use the trait for dependency injection
21//! fn diff_with_matcher(matcher: &dyn ComponentMatcher) {
22//!     let score = matcher.match_score(&comp_a, &comp_b);
23//! }
24//!
25//! let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
26//! diff_with_matcher(&matcher);
27//! ```
28
29pub mod adaptive;
30mod aliases;
31mod config;
32pub mod cross_ecosystem;
33pub mod custom_rules;
34pub mod ecosystem_config;
35pub mod index;
36pub mod lsh;
37mod purl;
38pub mod rule_engine;
39mod rules;
40mod traits;
41
42pub use adaptive::{
43    AdaptiveMatching, AdaptiveMethod, AdaptiveThreshold, AdaptiveThresholdConfig,
44    AdaptiveThresholdResult, ScoreStats,
45};
46pub use aliases::AliasTable;
47pub use config::{CrossEcosystemConfig, FuzzyMatchConfig, MultiFieldWeights};
48pub use cross_ecosystem::{CrossEcosystemDb, CrossEcosystemMatch, PackageFamily};
49pub use custom_rules::{
50    AliasPattern, EquivalenceGroup, ExclusionRule, MatchingRulesConfig, RulePrecedence,
51    RulesSummary,
52};
53pub use ecosystem_config::{
54    ConfigError, CustomEquivalence, CustomRules, EcosystemConfig, EcosystemRulesConfig,
55    GlobalSettings, GroupMigration, ImportMapping, NormalizationConfig, PackageGroup,
56    ScopeHandling, SecurityConfig, TyposquatEntry, VersionSpec, VersioningConfig,
57};
58pub use index::{
59    BatchCandidateConfig, BatchCandidateGenerator, BatchCandidateResult, BatchCandidateStats,
60    ComponentIndex, IndexStats, LazyComponentIndex, NormalizedEntry,
61};
62pub use lsh::{LshConfig, LshIndex, LshIndexStats, MinHashSignature};
63pub use purl::PurlNormalizer;
64pub use rule_engine::{AppliedRule, AppliedRuleType, RuleApplicationResult, RuleEngine};
65pub use rules::EcosystemRules;
66pub use traits::{
67    CacheConfig, CacheStats, CachedMatcher, ComponentMatcher, CompositeMatcher,
68    CompositeMatcherBuilder, MatchExplanation, MatchMetadata, MatchResult, MatchTier,
69    ScoreComponent,
70};
71
72use crate::model::Component;
73use strsim::{jaro_winkler, levenshtein};
74
75/// Fuzzy matcher for component correlation.
76pub struct FuzzyMatcher {
77    config: FuzzyMatchConfig,
78    alias_table: AliasTable,
79    purl_normalizer: PurlNormalizer,
80    ecosystem_rules: EcosystemRules,
81}
82
83impl FuzzyMatcher {
84    /// Create a new fuzzy matcher with the given configuration
85    pub fn new(config: FuzzyMatchConfig) -> Self {
86        Self {
87            config,
88            alias_table: AliasTable::default(),
89            purl_normalizer: PurlNormalizer::new(),
90            ecosystem_rules: EcosystemRules::new(),
91        }
92    }
93
94    /// Get the current configuration.
95    pub fn config(&self) -> &FuzzyMatchConfig {
96        &self.config
97    }
98
99    /// Create a matcher with a custom alias table
100    pub fn with_alias_table(mut self, table: AliasTable) -> Self {
101        self.alias_table = table;
102        self
103    }
104
105    /// Match two components and return a confidence score (0.0 - 1.0)
106    pub fn match_components(&self, a: &Component, b: &Component) -> f64 {
107        // Layer 1: Exact PURL match
108        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
109            let norm_a = self.purl_normalizer.normalize(purl_a);
110            let norm_b = self.purl_normalizer.normalize(purl_b);
111            if norm_a == norm_b {
112                return 1.0;
113            }
114        }
115
116        // Layer 2: Alias table lookup
117        if self.check_alias_match(a, b) {
118            return 0.95;
119        }
120
121        // Layer 3: Rule-based ecosystem normalization
122        if let Some(score) = self.check_ecosystem_rules(a, b) {
123            if score >= 0.90 {
124                return score;
125            }
126        }
127
128        // Layer 4: Multi-field weighted scoring (if configured) or fuzzy string similarity
129        if let Some(ref weights) = self.config.field_weights {
130            // Use multi-field scoring when configured
131            let result = self.compute_multi_field_score(a, b, weights);
132            if result.total >= self.config.threshold {
133                return result.total;
134            }
135        } else {
136            // Fall back to simple fuzzy string similarity
137            let fuzzy_score = self.compute_fuzzy_score(a, b);
138            if fuzzy_score >= self.config.threshold {
139                return fuzzy_score;
140            }
141        }
142
143        0.0
144    }
145
146    /// Check if components match via alias table
147    fn check_alias_match(&self, a: &Component, b: &Component) -> bool {
148        // Check if either component's name is an alias of the other
149        let names_a = self.get_all_names(a);
150        let names_b = self.get_all_names(b);
151
152        for name_a in &names_a {
153            if let Some(canonical) = self.alias_table.get_canonical(name_a) {
154                for name_b in &names_b {
155                    if self.alias_table.is_alias(&canonical, name_b) {
156                        return true;
157                    }
158                }
159            }
160        }
161
162        false
163    }
164
165    /// Get all possible names for a component
166    fn get_all_names(&self, comp: &Component) -> Vec<String> {
167        let mut names = vec![comp.name.clone()];
168        names.extend(comp.identifiers.aliases.clone());
169
170        // Extract name from PURL if available
171        if let Some(purl) = &comp.identifiers.purl {
172            if let Some(name) = self.extract_name_from_purl(purl) {
173                names.push(name);
174            }
175        }
176
177        names
178    }
179
180    /// Extract the package name from a PURL
181    fn extract_name_from_purl(&self, purl: &str) -> Option<String> {
182        // pkg:type/namespace/name@version?qualifiers#subpath
183        let without_pkg = purl.strip_prefix("pkg:")?;
184        let parts: Vec<&str> = without_pkg.split('/').collect();
185
186        if parts.len() >= 2 {
187            let name_part = parts.last()?;
188            // Remove version and qualifiers
189            let name = name_part.split('@').next()?;
190            Some(name.to_string())
191        } else {
192            None
193        }
194    }
195
196    /// Check ecosystem-specific matching rules
197    fn check_ecosystem_rules(&self, a: &Component, b: &Component) -> Option<f64> {
198        let ecosystem_a = a.ecosystem.as_ref()?;
199        let ecosystem_b = b.ecosystem.as_ref()?;
200
201        // Must be same ecosystem for rule-based matching
202        if ecosystem_a != ecosystem_b {
203            return None;
204        }
205
206        let norm_a = self.ecosystem_rules.normalize_name(&a.name, ecosystem_a);
207        let norm_b = self.ecosystem_rules.normalize_name(&b.name, ecosystem_b);
208
209        if norm_a == norm_b {
210            return Some(0.90);
211        }
212
213        None
214    }
215
216    /// Compute fuzzy string similarity score
217    fn compute_fuzzy_score(&self, a: &Component, b: &Component) -> f64 {
218        let name_a = a.name.to_lowercase();
219        let name_b = b.name.to_lowercase();
220
221        // Compute Jaro-Winkler similarity
222        let jw_score = jaro_winkler(&name_a, &name_b);
223
224        // Compute normalized Levenshtein distance
225        let max_len = name_a.len().max(name_b.len());
226        let lev_distance = levenshtein(&name_a, &name_b);
227        let lev_score = if max_len > 0 {
228            1.0 - (lev_distance as f64 / max_len as f64)
229        } else {
230            1.0
231        };
232
233        // Compute token-based similarity (catches reordered names like "react-dom" vs "dom-react")
234        let token_score = Self::compute_token_similarity(&name_a, &name_b);
235
236        // Compute phonetic similarity (catches typos like "color" vs "colour")
237        let phonetic_score = Self::compute_phonetic_similarity(&name_a, &name_b);
238
239        // Weighted combination of character-based scores
240        let char_score = (jw_score * self.config.jaro_winkler_weight)
241            + (lev_score * self.config.levenshtein_weight);
242
243        // Use the MAXIMUM of character, token, and phonetic scores
244        // This allows each method to catch different types of variations
245        let combined = char_score.max(token_score).max(phonetic_score * 0.85);
246
247        // Version-aware boost (semantic version similarity)
248        let version_boost = Self::compute_version_similarity(&a.version, &b.version);
249
250        (combined + version_boost).min(1.0)
251    }
252
253    /// Compute token-based similarity using Jaccard index on name tokens.
254    ///
255    /// Splits names on common delimiters (-, _, ., @, /) and compares token sets.
256    /// This catches reordered names like "react-dom" ↔ "dom-react".
257    fn compute_token_similarity(name_a: &str, name_b: &str) -> f64 {
258        use std::collections::HashSet;
259
260        let tokens_a: HashSet<&str> = name_a
261            .split(['-', '_', '.', '@', '/'])
262            .filter(|t| !t.is_empty())
263            .collect();
264
265        let tokens_b: HashSet<&str> = name_b
266            .split(['-', '_', '.', '@', '/'])
267            .filter(|t| !t.is_empty())
268            .collect();
269
270        if tokens_a.is_empty() && tokens_b.is_empty() {
271            return 1.0;
272        }
273        if tokens_a.is_empty() || tokens_b.is_empty() {
274            return 0.0;
275        }
276
277        let intersection = tokens_a.intersection(&tokens_b).count();
278        let union = tokens_a.union(&tokens_b).count();
279
280        if union > 0 {
281            intersection as f64 / union as f64
282        } else {
283            0.0
284        }
285    }
286
287    /// Compute version similarity with semantic awareness.
288    ///
289    /// Returns a boost value (0.0 - 0.1) based on how similar versions are:
290    /// - Exact match: 0.10
291    /// - Same major.minor: 0.07
292    /// - Same major: 0.04
293    /// - Both present but different: 0.0
294    /// - One or both missing: 0.0
295    fn compute_version_similarity(va: &Option<String>, vb: &Option<String>) -> f64 {
296        match (va, vb) {
297            (Some(a), Some(b)) if a == b => 0.10, // Exact match
298            (Some(a), Some(b)) => {
299                // Parse semantic versions
300                let parts_a: Vec<&str> = a.split('.').collect();
301                let parts_b: Vec<&str> = b.split('.').collect();
302
303                // Extract major.minor.patch (handle non-numeric gracefully)
304                let major_a = parts_a.first().and_then(|s| s.parse::<u32>().ok());
305                let major_b = parts_b.first().and_then(|s| s.parse::<u32>().ok());
306                let minor_a = parts_a
307                    .get(1)
308                    .and_then(|s| s.split('-').next())
309                    .and_then(|s| s.parse::<u32>().ok());
310                let minor_b = parts_b
311                    .get(1)
312                    .and_then(|s| s.split('-').next())
313                    .and_then(|s| s.parse::<u32>().ok());
314
315                match (major_a, major_b, minor_a, minor_b) {
316                    (Some(ma), Some(mb), Some(mia), Some(mib)) if ma == mb && mia == mib => 0.07,
317                    (Some(ma), Some(mb), _, _) if ma == mb => 0.04,
318                    _ => 0.0,
319                }
320            }
321            _ => 0.0, // One or both missing
322        }
323    }
324
325    /// Compute Soundex code for phonetic matching.
326    ///
327    /// Soundex encodes names by their pronunciation, helping match:
328    /// - "color" ↔ "colour"
329    /// - "jason" ↔ "jayson"
330    /// - "smith" ↔ "smyth"
331    fn soundex(name: &str) -> String {
332        if name.is_empty() {
333            return String::new();
334        }
335
336        let name_upper: String = name
337            .to_uppercase()
338            .chars()
339            .filter(|c| c.is_ascii_alphabetic())
340            .collect();
341        if name_upper.is_empty() {
342            return String::new();
343        }
344
345        let mut chars = name_upper.chars();
346        let first_char = chars.next().expect("name_upper is non-empty after empty check above");
347        let mut code = String::with_capacity(4);
348        code.push(first_char);
349
350        let mut last_digit = Self::soundex_digit(first_char);
351
352        for c in chars {
353            let digit = Self::soundex_digit(c);
354            if digit != '0' && digit != last_digit {
355                code.push(digit);
356                if code.len() == 4 {
357                    break;
358                }
359            }
360            if digit != '0' {
361                last_digit = digit;
362            }
363        }
364
365        // Pad with zeros if needed
366        while code.len() < 4 {
367            code.push('0');
368        }
369
370        code
371    }
372
373    /// Get Soundex digit for a character.
374    fn soundex_digit(c: char) -> char {
375        match c {
376            'B' | 'F' | 'P' | 'V' => '1',
377            'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
378            'D' | 'T' => '3',
379            'L' => '4',
380            'M' | 'N' => '5',
381            'R' => '6',
382            _ => '0', // A, E, I, O, U, H, W, Y
383        }
384    }
385
386    /// Compute phonetic similarity using Soundex.
387    ///
388    /// Returns 1.0 if Soundex codes match, 0.0 otherwise.
389    /// Also checks individual tokens for partial phonetic matches.
390    pub fn compute_phonetic_similarity(name_a: &str, name_b: &str) -> f64 {
391        // Compare full name Soundex
392        let soundex_a = Self::soundex(name_a);
393        let soundex_b = Self::soundex(name_b);
394
395        if !soundex_a.is_empty() && soundex_a == soundex_b {
396            return 1.0;
397        }
398
399        // Compare token-by-token for compound names
400        let tokens_a: Vec<&str> = name_a
401            .split(|c: char| !c.is_alphanumeric())
402            .filter(|t| !t.is_empty())
403            .collect();
404        let tokens_b: Vec<&str> = name_b
405            .split(|c: char| !c.is_alphanumeric())
406            .filter(|t| !t.is_empty())
407            .collect();
408
409        if tokens_a.is_empty() || tokens_b.is_empty() {
410            return 0.0;
411        }
412
413        // Count matching Soundex codes between tokens
414        let mut matches = 0;
415        let total = tokens_a.len().max(tokens_b.len());
416
417        for ta in &tokens_a {
418            let sa = Self::soundex(ta);
419            if sa.is_empty() {
420                continue;
421            }
422            for tb in &tokens_b {
423                let sb = Self::soundex(tb);
424                if sa == sb {
425                    matches += 1;
426                    break;
427                }
428            }
429        }
430
431        if total == 0 {
432            0.0
433        } else {
434            matches as f64 / total as f64
435        }
436    }
437
438    /// Compute multi-field weighted score.
439    ///
440    /// Combines scores from multiple component fields based on configured weights.
441    pub fn compute_multi_field_score(
442        &self,
443        a: &Component,
444        b: &Component,
445        weights: &config::MultiFieldWeights,
446    ) -> MultiFieldScoreResult {
447        use std::collections::HashSet;
448
449        let mut result = MultiFieldScoreResult::default();
450
451        // 1. Name similarity (using fuzzy scoring)
452        let name_score = self.compute_fuzzy_score(a, b);
453        result.name_score = name_score;
454        result.total += name_score * weights.name;
455
456        // 2. Version match (graduated or binary scoring)
457        let version_score = if weights.version_divergence_enabled {
458            compute_version_divergence_score(&a.version, &b.version, weights)
459        } else {
460            // Legacy binary scoring
461            match (&a.version, &b.version) {
462                (Some(va), Some(vb)) if va == vb => 1.0,
463                (None, None) => 0.5, // Both missing = neutral
464                _ => 0.0,
465            }
466        };
467        result.version_score = version_score;
468        result.total += version_score * weights.version;
469
470        // 3. Ecosystem match (exact match = 1.0, mismatch applies penalty)
471        let (ecosystem_score, ecosystem_penalty) = match (&a.ecosystem, &b.ecosystem) {
472            (Some(ea), Some(eb)) if ea == eb => (1.0, 0.0),
473            (None, None) => (0.5, 0.0), // Both missing = neutral, no penalty
474            (Some(_), Some(_)) => (0.0, weights.ecosystem_mismatch_penalty), // Different ecosystems = penalty
475            _ => (0.0, 0.0), // One missing = no match but no penalty
476        };
477        result.ecosystem_score = ecosystem_score;
478        result.total += ecosystem_score * weights.ecosystem + ecosystem_penalty;
479
480        // 4. License overlap (Jaccard similarity on declared licenses)
481        let licenses_a: HashSet<_> = a
482            .licenses
483            .declared
484            .iter()
485            .map(|l| l.expression.as_str())
486            .collect();
487        let licenses_b: HashSet<_> = b
488            .licenses
489            .declared
490            .iter()
491            .map(|l| l.expression.as_str())
492            .collect();
493        let license_score = if licenses_a.is_empty() && licenses_b.is_empty() {
494            0.5 // Both empty = neutral
495        } else if licenses_a.is_empty() || licenses_b.is_empty() {
496            0.0 // One empty = no match
497        } else {
498            let intersection = licenses_a.intersection(&licenses_b).count();
499            let union = licenses_a.union(&licenses_b).count();
500            if union > 0 {
501                intersection as f64 / union as f64
502            } else {
503                0.0
504            }
505        };
506        result.license_score = license_score;
507        result.total += license_score * weights.licenses;
508
509        // 5. Supplier match (exact match on supplier organization name)
510        let supplier_score = match (&a.supplier, &b.supplier) {
511            (Some(sa), Some(sb)) if sa.name.to_lowercase() == sb.name.to_lowercase() => 1.0,
512            (None, None) => 0.5, // Both missing = neutral
513            _ => 0.0,
514        };
515        result.supplier_score = supplier_score;
516        result.total += supplier_score * weights.supplier;
517
518        // 6. Group/namespace match
519        let group_score = match (&a.group, &b.group) {
520            (Some(ga), Some(gb)) if ga.to_lowercase() == gb.to_lowercase() => 1.0,
521            (None, None) => 0.5, // Both missing = neutral
522            _ => 0.0,
523        };
524        result.group_score = group_score;
525        result.total += group_score * weights.group;
526
527        // Clamp total to [0.0, 1.0] after penalty application
528        result.total = result.total.clamp(0.0, 1.0);
529
530        result
531    }
532}
533
534/// Compute version divergence score using semver distance.
535///
536/// Returns a graduated score based on how different the versions are:
537/// - Exact match: 1.0
538/// - Same major.minor: 0.8 - (patch_diff * 0.01), min 0.5
539/// - Same major: 0.5 - (minor_diff * minor_penalty), min 0.2
540/// - Different major: 0.3 - (major_diff * major_penalty), min 0.0
541fn compute_version_divergence_score(
542    version_a: &Option<String>,
543    version_b: &Option<String>,
544    weights: &config::MultiFieldWeights,
545) -> f64 {
546    match (version_a, version_b) {
547        (Some(va), Some(vb)) if va == vb => 1.0,
548        (None, None) => 0.5, // Both missing = neutral
549        (Some(va), Some(vb)) => {
550            // Parse semver components
551            let parts_a = parse_semver_parts(va);
552            let parts_b = parse_semver_parts(vb);
553
554            match (parts_a, parts_b) {
555                (Some((maj_a, min_a, patch_a)), Some((maj_b, min_b, patch_b))) => {
556                    if maj_a == maj_b && min_a == min_b {
557                        // Same major.minor - small penalty for patch difference
558                        let patch_diff = (patch_a as i64 - patch_b as i64).unsigned_abs() as f64;
559                        (0.8 - patch_diff * 0.01).max(0.5)
560                    } else if maj_a == maj_b {
561                        // Same major - moderate penalty for minor difference
562                        let minor_diff = (min_a as i64 - min_b as i64).unsigned_abs() as f64;
563                        (0.5 - minor_diff * weights.version_minor_penalty).max(0.2)
564                    } else {
565                        // Different major - larger penalty
566                        let major_diff = (maj_a as i64 - maj_b as i64).unsigned_abs() as f64;
567                        (0.3 - major_diff * weights.version_major_penalty).max(0.0)
568                    }
569                }
570                _ => {
571                    // Couldn't parse semver - fall back to string comparison
572                    // Give partial credit if versions share a common prefix
573                    let common_prefix_len = va
574                        .chars()
575                        .zip(vb.chars())
576                        .take_while(|(a, b)| a == b)
577                        .count();
578                    let max_len = va.len().max(vb.len());
579                    if max_len > 0 && common_prefix_len > 0 {
580                        (common_prefix_len as f64 / max_len as f64 * 0.5).min(0.4)
581                    } else {
582                        0.1 // Different versions with no common prefix
583                    }
584                }
585            }
586        }
587        _ => 0.0, // One missing
588    }
589}
590
591/// Parse a version string into semver components (major, minor, patch).
592/// Returns None if the version cannot be parsed.
593fn parse_semver_parts(version: &str) -> Option<(u32, u32, u32)> {
594    // Strip common prefixes like 'v' or 'V'
595    let version = version.trim_start_matches(['v', 'V']);
596
597    // Split on '.' and try to parse first three components
598    let mut parts = version.split(['.', '-', '+']);
599
600    let major: u32 = parts.next()?.parse().ok()?;
601    let minor: u32 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
602    let patch: u32 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
603
604    Some((major, minor, patch))
605}
606
607/// Result of multi-field scoring with per-field breakdown.
608#[derive(Debug, Clone, Default)]
609pub struct MultiFieldScoreResult {
610    /// Total weighted score (0.0 - 1.0)
611    pub total: f64,
612    /// Name similarity score
613    pub name_score: f64,
614    /// Version match score
615    pub version_score: f64,
616    /// Ecosystem match score
617    pub ecosystem_score: f64,
618    /// License overlap score (Jaccard)
619    pub license_score: f64,
620    /// Supplier match score
621    pub supplier_score: f64,
622    /// Group/namespace match score
623    pub group_score: f64,
624}
625
626impl MultiFieldScoreResult {
627    /// Get a human-readable summary of the score breakdown.
628    pub fn summary(&self) -> String {
629        format!(
630            "Total: {:.2} (name: {:.2}, version: {:.2}, ecosystem: {:.2}, licenses: {:.2}, supplier: {:.2}, group: {:.2})",
631            self.total,
632            self.name_score,
633            self.version_score,
634            self.ecosystem_score,
635            self.license_score,
636            self.supplier_score,
637            self.group_score
638        )
639    }
640}
641
642impl Default for FuzzyMatcher {
643    fn default() -> Self {
644        Self::new(FuzzyMatchConfig::balanced())
645    }
646}
647
648impl ComponentMatcher for FuzzyMatcher {
649    fn match_score(&self, a: &Component, b: &Component) -> f64 {
650        self.match_components(a, b)
651    }
652
653    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
654        // Layer 1: Exact PURL match
655        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
656            let norm_a = self.purl_normalizer.normalize(purl_a);
657            let norm_b = self.purl_normalizer.normalize(purl_b);
658            if norm_a == norm_b {
659                return MatchResult::with_metadata(
660                    1.0,
661                    MatchTier::ExactIdentifier,
662                    MatchMetadata {
663                        matched_fields: vec!["purl".to_string()],
664                        normalization: Some("purl_normalized".to_string()),
665                        rule_id: None,
666                    },
667                );
668            }
669        }
670
671        // Layer 2: Alias table lookup
672        if self.check_alias_match(a, b) {
673            return MatchResult::with_metadata(
674                0.95,
675                MatchTier::Alias,
676                MatchMetadata {
677                    matched_fields: vec!["name".to_string()],
678                    normalization: Some("alias_table".to_string()),
679                    rule_id: None,
680                },
681            );
682        }
683
684        // Layer 3: Rule-based ecosystem normalization
685        if let Some(score) = self.check_ecosystem_rules(a, b) {
686            if score >= 0.90 {
687                return MatchResult::with_metadata(
688                    score,
689                    MatchTier::EcosystemRule,
690                    MatchMetadata {
691                        matched_fields: vec!["name".to_string(), "ecosystem".to_string()],
692                        normalization: Some("ecosystem_rules".to_string()),
693                        rule_id: None,
694                    },
695                );
696            }
697        }
698
699        // Layer 4: Fuzzy string similarity
700        let fuzzy_score = self.compute_fuzzy_score(a, b);
701        if fuzzy_score >= self.config.threshold {
702            return MatchResult::with_metadata(
703                fuzzy_score,
704                MatchTier::Fuzzy,
705                MatchMetadata {
706                    matched_fields: vec!["name".to_string()],
707                    normalization: Some("fuzzy_similarity".to_string()),
708                    rule_id: None,
709                },
710            );
711        }
712
713        MatchResult::no_match()
714    }
715
716    fn name(&self) -> &str {
717        "FuzzyMatcher"
718    }
719
720    fn threshold(&self) -> f64 {
721        self.config.threshold
722    }
723
724    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
725        use strsim::{jaro_winkler, levenshtein};
726
727        // Layer 1: Exact PURL match
728        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
729            let norm_a = self.purl_normalizer.normalize(purl_a);
730            let norm_b = self.purl_normalizer.normalize(purl_b);
731            if norm_a == norm_b {
732                return MatchExplanation::matched(
733                    MatchTier::ExactIdentifier,
734                    1.0,
735                    format!(
736                        "Exact PURL match: '{}' equals '{}' after normalization",
737                        purl_a, purl_b
738                    ),
739                )
740                .with_normalization("purl_normalized");
741            }
742        }
743
744        // Layer 2: Alias table lookup
745        if self.check_alias_match(a, b) {
746            return MatchExplanation::matched(
747                MatchTier::Alias,
748                0.95,
749                format!(
750                    "'{}' and '{}' are known aliases of the same package",
751                    a.name, b.name
752                ),
753            )
754            .with_normalization("alias_table");
755        }
756
757        // Layer 3: Rule-based ecosystem normalization
758        if let Some(score) = self.check_ecosystem_rules(a, b) {
759            if score >= 0.90 {
760                let ecosystem = a
761                    .ecosystem
762                    .as_ref()
763                    .map(|e| e.to_string())
764                    .unwrap_or_else(|| "unknown".to_string());
765                return MatchExplanation::matched(
766                    MatchTier::EcosystemRule,
767                    score,
768                    format!(
769                        "Names match after {} ecosystem normalization: '{}' -> '{}'",
770                        ecosystem, a.name, b.name
771                    ),
772                )
773                .with_normalization(format!("{}_normalization", ecosystem));
774            }
775        }
776
777        // Layer 4: Fuzzy string similarity - compute detailed breakdown
778        let name_a = a.name.to_lowercase();
779        let name_b = b.name.to_lowercase();
780
781        let jw_score = jaro_winkler(&name_a, &name_b);
782        let max_len = name_a.len().max(name_b.len());
783        let lev_distance = levenshtein(&name_a, &name_b);
784        let lev_score = if max_len > 0 {
785            1.0 - (lev_distance as f64 / max_len as f64)
786        } else {
787            1.0
788        };
789
790        let jw_weighted = jw_score * self.config.jaro_winkler_weight;
791        let lev_weighted = lev_score * self.config.levenshtein_weight;
792
793        let version_boost = if a.version == b.version && a.version.is_some() {
794            0.05
795        } else {
796            0.0
797        };
798
799        let combined = (jw_weighted + lev_weighted + version_boost).min(1.0);
800
801        let mut explanation = if combined >= self.config.threshold {
802            MatchExplanation::matched(
803                MatchTier::Fuzzy,
804                combined,
805                format!(
806                    "Fuzzy match: '{}' ~ '{}' with {:.0}% similarity",
807                    a.name,
808                    b.name,
809                    combined * 100.0
810                ),
811            )
812        } else {
813            MatchExplanation::no_match(format!(
814                "Fuzzy similarity {:.2} below threshold {:.2}",
815                combined, self.config.threshold
816            ))
817        };
818
819        // Add score breakdown
820        explanation = explanation
821            .with_score_component(ScoreComponent {
822                name: "Jaro-Winkler".to_string(),
823                weight: self.config.jaro_winkler_weight,
824                raw_score: jw_score,
825                weighted_score: jw_weighted,
826                description: format!("'{}' vs '{}' = {:.2}", name_a, name_b, jw_score),
827            })
828            .with_score_component(ScoreComponent {
829                name: "Levenshtein".to_string(),
830                weight: self.config.levenshtein_weight,
831                raw_score: lev_score,
832                weighted_score: lev_weighted,
833                description: format!(
834                    "edit distance {} / max_len {} = {:.2}",
835                    lev_distance, max_len, lev_score
836                ),
837            });
838
839        if version_boost > 0.0 {
840            explanation = explanation.with_score_component(ScoreComponent {
841                name: "Version boost".to_string(),
842                weight: 1.0,
843                raw_score: version_boost,
844                weighted_score: version_boost,
845                description: format!("versions match: {:?}", a.version),
846            });
847        }
848
849        explanation.with_normalization("lowercase")
850    }
851}
852
853#[cfg(test)]
854mod tests {
855    use super::*;
856
857    #[test]
858    fn test_exact_purl_match() {
859        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
860
861        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
862        a.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
863
864        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
865        b.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
866
867        assert_eq!(matcher.match_components(&a, &b), 1.0);
868    }
869
870    #[test]
871    fn test_fuzzy_name_match() {
872        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
873
874        // Similar names should have some fuzzy match score
875        let a = Component::new("lodash-es".to_string(), "comp-1".to_string());
876        let b = Component::new("lodash".to_string(), "comp-2".to_string());
877
878        let score = matcher.match_components(&a, &b);
879        // With permissive threshold (0.70), similar names should match
880        assert!(
881            score >= 0.70,
882            "lodash-es vs lodash should have score >= 0.70, got {}",
883            score
884        );
885    }
886
887    #[test]
888    fn test_different_names_low_score() {
889        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::strict());
890
891        let a = Component::new("react".to_string(), "comp-1".to_string());
892        let b = Component::new("angular".to_string(), "comp-2".to_string());
893
894        let score = matcher.match_components(&a, &b);
895        assert!(
896            score < 0.5,
897            "react vs angular should have low score, got {}",
898            score
899        );
900    }
901
902    #[test]
903    fn test_multi_field_weights_normalized() {
904        let weights = config::MultiFieldWeights::balanced();
905        assert!(
906            weights.is_normalized(),
907            "Balanced weights should be normalized"
908        );
909
910        let weights = config::MultiFieldWeights::name_focused();
911        assert!(
912            weights.is_normalized(),
913            "Name-focused weights should be normalized"
914        );
915
916        let weights = config::MultiFieldWeights::security_focused();
917        assert!(
918            weights.is_normalized(),
919            "Security-focused weights should be normalized"
920        );
921    }
922
923    #[test]
924    fn test_multi_field_scoring_same_component() {
925        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
926        let weights = config::MultiFieldWeights::balanced();
927
928        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
929        a.version = Some("4.17.21".to_string());
930        a.ecosystem = Some(crate::model::Ecosystem::Npm);
931
932        // Identical component should score very high
933        // Note: empty licenses/supplier/group get neutral 0.5 score, so total won't be 1.0
934        let result = matcher.compute_multi_field_score(&a, &a, &weights);
935        assert!(
936            result.total > 0.90,
937            "Same component should score > 0.90, got {}",
938            result.total
939        );
940        assert_eq!(result.name_score, 1.0);
941        assert_eq!(result.version_score, 1.0);
942        assert_eq!(result.ecosystem_score, 1.0);
943        // Empty fields get neutral 0.5 score
944        assert_eq!(
945            result.license_score, 0.5,
946            "Empty licenses should be neutral"
947        );
948        assert_eq!(
949            result.supplier_score, 0.5,
950            "Empty supplier should be neutral"
951        );
952        assert_eq!(result.group_score, 0.5, "Empty group should be neutral");
953    }
954
955    #[test]
956    fn test_multi_field_scoring_different_versions() {
957        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
958        let weights = config::MultiFieldWeights::balanced();
959
960        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
961        a.version = Some("4.17.21".to_string());
962        a.ecosystem = Some(crate::model::Ecosystem::Npm);
963
964        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
965        b.version = Some("4.17.20".to_string()); // Different patch version
966        b.ecosystem = Some(crate::model::Ecosystem::Npm);
967
968        let result = matcher.compute_multi_field_score(&a, &b, &weights);
969
970        // Name matches perfectly
971        assert!(result.name_score > 0.9, "Name score should be > 0.9");
972
973        // Graduated version scoring: same major.minor gives high score
974        // 4.17.21 vs 4.17.20 = same major.minor, patch diff of 1
975        // Expected: 0.8 - 0.01 * 1 = 0.79
976        assert!(
977            result.version_score > 0.7,
978            "Same major.minor with patch diff should score high, got {}",
979            result.version_score
980        );
981
982        // Ecosystem matches
983        assert_eq!(
984            result.ecosystem_score, 1.0,
985            "Same ecosystem should score 1.0"
986        );
987
988        // Total should be high due to name, ecosystem, and graduated version score
989        assert!(
990            result.total > 0.8,
991            "Total should be > 0.8, got {}",
992            result.total
993        );
994    }
995
996    #[test]
997    fn test_multi_field_scoring_different_major_versions() {
998        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
999        let weights = config::MultiFieldWeights::balanced();
1000
1001        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
1002        a.version = Some("4.17.21".to_string());
1003        a.ecosystem = Some(crate::model::Ecosystem::Npm);
1004
1005        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
1006        b.version = Some("3.10.0".to_string()); // Different major version
1007        b.ecosystem = Some(crate::model::Ecosystem::Npm);
1008
1009        let result = matcher.compute_multi_field_score(&a, &b, &weights);
1010
1011        // Graduated version scoring: different major gives low score
1012        // 4 vs 3 = major diff of 1
1013        // Expected: 0.3 - 0.10 * 1 = 0.20
1014        assert!(
1015            result.version_score < 0.3,
1016            "Different major versions should score low, got {}",
1017            result.version_score
1018        );
1019    }
1020
1021    #[test]
1022    fn test_multi_field_scoring_legacy_weights() {
1023        // Test that legacy weights disable graduated scoring
1024        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
1025        let weights = config::MultiFieldWeights::legacy();
1026
1027        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
1028        a.version = Some("4.17.21".to_string());
1029        a.ecosystem = Some(crate::model::Ecosystem::Npm);
1030
1031        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
1032        b.version = Some("4.17.20".to_string());
1033        b.ecosystem = Some(crate::model::Ecosystem::Npm);
1034
1035        let result = matcher.compute_multi_field_score(&a, &b, &weights);
1036
1037        // Legacy mode: binary version scoring (exact match or 0)
1038        assert_eq!(
1039            result.version_score, 0.0,
1040            "Legacy mode: different versions should score 0"
1041        );
1042    }
1043
1044    #[test]
1045    fn test_multi_field_config_preset() {
1046        let config = FuzzyMatchConfig::from_preset("balanced-multi").unwrap();
1047        assert!(config.field_weights.is_some());
1048
1049        let config = FuzzyMatchConfig::from_preset("strict_multi").unwrap();
1050        assert!(config.field_weights.is_some());
1051    }
1052
1053    #[test]
1054    fn test_multi_field_score_result_summary() {
1055        let result = MultiFieldScoreResult {
1056            total: 0.85,
1057            name_score: 1.0,
1058            version_score: 0.0,
1059            ecosystem_score: 1.0,
1060            license_score: 0.5,
1061            supplier_score: 0.5,
1062            group_score: 0.5,
1063        };
1064
1065        let summary = result.summary();
1066        assert!(summary.contains("0.85"));
1067        assert!(summary.contains("name: 1.00"));
1068    }
1069
1070    #[test]
1071    fn test_token_similarity_exact() {
1072        let score = FuzzyMatcher::compute_token_similarity("react-dom", "react-dom");
1073        assert_eq!(score, 1.0);
1074    }
1075
1076    #[test]
1077    fn test_token_similarity_reordered() {
1078        // Reordered tokens should have high similarity
1079        let score = FuzzyMatcher::compute_token_similarity("react-dom", "dom-react");
1080        assert_eq!(score, 1.0, "Reordered tokens should match perfectly");
1081    }
1082
1083    #[test]
1084    fn test_token_similarity_partial() {
1085        // Partial token overlap
1086        let score = FuzzyMatcher::compute_token_similarity("react-dom-utils", "react-dom");
1087        // Jaccard: 2 common / 3 total = 0.667
1088        assert!(
1089            (score - 0.667).abs() < 0.01,
1090            "Partial overlap should be ~0.67, got {}",
1091            score
1092        );
1093    }
1094
1095    #[test]
1096    fn test_token_similarity_different_delimiters() {
1097        // Different delimiters should still work
1098        let score = FuzzyMatcher::compute_token_similarity("my_package_name", "my-package-name");
1099        assert_eq!(score, 1.0, "Different delimiters should match");
1100    }
1101
1102    #[test]
1103    fn test_token_similarity_no_overlap() {
1104        let score = FuzzyMatcher::compute_token_similarity("react", "angular");
1105        assert_eq!(score, 0.0, "No common tokens should score 0");
1106    }
1107
1108    #[test]
1109    fn test_version_similarity_exact() {
1110        let score = FuzzyMatcher::compute_version_similarity(
1111            &Some("1.2.3".to_string()),
1112            &Some("1.2.3".to_string()),
1113        );
1114        assert_eq!(score, 0.10, "Exact version match should give max boost");
1115    }
1116
1117    #[test]
1118    fn test_version_similarity_same_major_minor() {
1119        let score = FuzzyMatcher::compute_version_similarity(
1120            &Some("1.2.3".to_string()),
1121            &Some("1.2.4".to_string()),
1122        );
1123        assert_eq!(score, 0.07, "Same major.minor should give 0.07 boost");
1124    }
1125
1126    #[test]
1127    fn test_version_similarity_same_major() {
1128        let score = FuzzyMatcher::compute_version_similarity(
1129            &Some("1.2.3".to_string()),
1130            &Some("1.5.0".to_string()),
1131        );
1132        assert_eq!(score, 0.04, "Same major should give 0.04 boost");
1133    }
1134
1135    #[test]
1136    fn test_version_similarity_different_major() {
1137        let score = FuzzyMatcher::compute_version_similarity(
1138            &Some("1.2.3".to_string()),
1139            &Some("2.0.0".to_string()),
1140        );
1141        assert_eq!(score, 0.0, "Different major versions should give no boost");
1142    }
1143
1144    #[test]
1145    fn test_version_similarity_prerelease() {
1146        // Handle prerelease versions like "1.2.3-beta"
1147        let score = FuzzyMatcher::compute_version_similarity(
1148            &Some("1.2.3-beta".to_string()),
1149            &Some("1.2.4".to_string()),
1150        );
1151        assert_eq!(score, 0.07, "Prerelease should still match major.minor");
1152    }
1153
1154    #[test]
1155    fn test_version_similarity_missing() {
1156        let score = FuzzyMatcher::compute_version_similarity(&None, &Some("1.0.0".to_string()));
1157        assert_eq!(score, 0.0, "Missing version should give no boost");
1158
1159        let score = FuzzyMatcher::compute_version_similarity(&None, &None);
1160        assert_eq!(score, 0.0, "Both missing should give no boost");
1161    }
1162
1163    #[test]
1164    fn test_fuzzy_match_with_reordered_tokens() {
1165        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
1166
1167        let a = Component::new("react-dom".to_string(), "comp-1".to_string());
1168        let b = Component::new("dom-react".to_string(), "comp-2".to_string());
1169
1170        let score = matcher.match_components(&a, &b);
1171        // Token similarity is 1.0, blended with character similarity
1172        assert!(
1173            score > 0.5,
1174            "Reordered names should still match, got {}",
1175            score
1176        );
1177    }
1178
1179    #[test]
1180    fn test_fuzzy_match_version_boost() {
1181        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
1182
1183        // Use slightly different names so we rely on fuzzy matching, not exact match
1184        let mut a = Component::new("lodash-utils".to_string(), "comp-1".to_string());
1185        a.version = Some("4.17.21".to_string());
1186
1187        let mut b = Component::new("lodash-util".to_string(), "comp-2".to_string());
1188        b.version = Some("4.17.20".to_string()); // Same major.minor -> +0.07 boost
1189
1190        let mut c = Component::new("lodash-util".to_string(), "comp-3".to_string());
1191        c.version = Some("5.0.0".to_string()); // Different major -> +0.0 boost
1192
1193        let score_same_minor = matcher.match_components(&a, &b);
1194        let score_diff_major = matcher.match_components(&a, &c);
1195
1196        // Both should match (fuzzy), but same_minor should have version boost
1197        assert!(score_same_minor > 0.0, "Same minor should match");
1198        assert!(score_diff_major > 0.0, "Different major should still match");
1199        assert!(
1200            score_same_minor > score_diff_major,
1201            "Same minor version should score higher: {} vs {}",
1202            score_same_minor,
1203            score_diff_major
1204        );
1205    }
1206
1207    #[test]
1208    fn test_soundex_basic() {
1209        // Test basic Soundex encoding
1210        assert_eq!(FuzzyMatcher::soundex("Robert"), "R163");
1211        assert_eq!(FuzzyMatcher::soundex("Rupert"), "R163"); // Same as Robert
1212        assert_eq!(FuzzyMatcher::soundex("Smith"), "S530");
1213        assert_eq!(FuzzyMatcher::soundex("Smyth"), "S530"); // Same as Smith
1214    }
1215
1216    #[test]
1217    fn test_soundex_empty() {
1218        assert_eq!(FuzzyMatcher::soundex(""), "");
1219        assert_eq!(FuzzyMatcher::soundex("123"), ""); // No letters
1220    }
1221
1222    #[test]
1223    fn test_phonetic_similarity_exact() {
1224        let score = FuzzyMatcher::compute_phonetic_similarity("color", "colour");
1225        assert_eq!(score, 1.0, "color and colour should match phonetically");
1226    }
1227
1228    #[test]
1229    fn test_phonetic_similarity_different() {
1230        let score = FuzzyMatcher::compute_phonetic_similarity("react", "angular");
1231        assert!(
1232            score < 0.5,
1233            "Different names should have low phonetic similarity"
1234        );
1235    }
1236
1237    #[test]
1238    fn test_phonetic_similarity_compound() {
1239        // Test compound names where tokens match phonetically
1240        let score = FuzzyMatcher::compute_phonetic_similarity("json-parser", "jayson-parser");
1241        assert!(
1242            score > 0.5,
1243            "Similar sounding compound names should match: {}",
1244            score
1245        );
1246    }
1247
1248    #[test]
1249    fn test_fuzzy_match_with_phonetic() {
1250        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
1251
1252        let a = Component::new("color-utils".to_string(), "comp-1".to_string());
1253        let b = Component::new("colour-utils".to_string(), "comp-2".to_string());
1254
1255        let score = matcher.match_components(&a, &b);
1256        assert!(
1257            score > 0.7,
1258            "Phonetically similar names should match: {}",
1259            score
1260        );
1261    }
1262}
sbom_tools/matching/mod.rs

sbom_tools/matching/
mod.rs