Skip to main content

sbom_tools/matching/
mod.rs

1//! Fuzzy matching engine for cross-ecosystem package correlation.
2//!
3//! This module provides multi-tier matching strategies for correlating
4//! components across different ecosystems and naming conventions.
5//!
6//! # Architecture
7//!
8//! The matching system is built on the [`ComponentMatcher`] trait, which
9//! provides a pluggable interface for different matching strategies:
10//!
11//! - [`FuzzyMatcher`]: Multi-tier fuzzy matching (default)
12//! - [`CompositeMatcher`]: Combines multiple matchers
13//! - [`CachedMatcher`]: Wraps any matcher with caching
14//!
15//! # Example
16//!
17//! ```ignore
18//! use sbom_tools::matching::{ComponentMatcher, FuzzyMatcher, FuzzyMatchConfig};
19//!
20//! // Use the trait for dependency injection
21//! fn diff_with_matcher(matcher: &dyn ComponentMatcher) {
22//!     let score = matcher.match_score(&comp_a, &comp_b);
23//! }
24//!
25//! let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
26//! diff_with_matcher(&matcher);
27//! ```
28
29pub mod adaptive;
30mod aliases;
31mod config;
32pub mod cross_ecosystem;
33pub mod custom_rules;
34pub mod ecosystem_config;
35pub mod index;
36pub mod lsh;
37mod purl;
38pub mod rule_engine;
39mod rules;
40mod traits;
41
42#[cfg(feature = "ml-matching")]
43mod embedding;
44
45pub use adaptive::{
46    AdaptiveMatching, AdaptiveMethod, AdaptiveThreshold, AdaptiveThresholdConfig,
47    AdaptiveThresholdResult, ScoreStats,
48};
49pub use aliases::AliasTable;
50pub use config::{CrossEcosystemConfig, FuzzyMatchConfig, MultiFieldWeights};
51pub use cross_ecosystem::{CrossEcosystemDb, CrossEcosystemMatch, PackageFamily};
52pub use custom_rules::{
53    AliasPattern, EquivalenceGroup, ExclusionRule, MatchingRulesConfig, RulePrecedence,
54    RulesSummary,
55};
56pub use ecosystem_config::{
57    ConfigError, CustomEquivalence, CustomRules, EcosystemConfig, EcosystemRulesConfig,
58    GlobalSettings, GroupMigration, ImportMapping, NormalizationConfig, PackageGroup,
59    ScopeHandling, SecurityConfig, TyposquatEntry, VersionSpec, VersioningConfig,
60};
61pub use index::{
62    BatchCandidateConfig, BatchCandidateGenerator, BatchCandidateResult, BatchCandidateStats,
63    ComponentIndex, IndexStats, LazyComponentIndex, NormalizedEntry,
64};
65pub use lsh::{LshConfig, LshIndex, LshIndexStats, MinHashSignature};
66pub use purl::PurlNormalizer;
67pub use rule_engine::{AppliedRule, AppliedRuleType, RuleApplicationResult, RuleEngine};
68pub use rules::EcosystemRules;
69pub use traits::{
70    CacheConfig, CacheStats, CachedMatcher, ComponentMatcher, CompositeMatcher,
71    CompositeMatcherBuilder, MatchExplanation, MatchMetadata, MatchResult, MatchTier,
72    ScoreComponent,
73};
74
75use crate::model::Component;
76use strsim::{jaro_winkler, levenshtein};
77
78/// Fuzzy matcher for component correlation.
79pub struct FuzzyMatcher {
80    config: FuzzyMatchConfig,
81    alias_table: AliasTable,
82    purl_normalizer: PurlNormalizer,
83    ecosystem_rules: EcosystemRules,
84}
85
86impl FuzzyMatcher {
87    /// Create a new fuzzy matcher with the given configuration
88    pub fn new(config: FuzzyMatchConfig) -> Self {
89        Self {
90            config,
91            alias_table: AliasTable::default(),
92            purl_normalizer: PurlNormalizer::new(),
93            ecosystem_rules: EcosystemRules::new(),
94        }
95    }
96
97    /// Get the current configuration.
98    pub fn config(&self) -> &FuzzyMatchConfig {
99        &self.config
100    }
101
102    /// Create a matcher with a custom alias table
103    pub fn with_alias_table(mut self, table: AliasTable) -> Self {
104        self.alias_table = table;
105        self
106    }
107
108    /// Match two components and return a confidence score (0.0 - 1.0)
109    pub fn match_components(&self, a: &Component, b: &Component) -> f64 {
110        // Layer 1: Exact PURL match
111        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
112            let norm_a = self.purl_normalizer.normalize(purl_a);
113            let norm_b = self.purl_normalizer.normalize(purl_b);
114            if norm_a == norm_b {
115                return 1.0;
116            }
117        }
118
119        // Layer 2: Alias table lookup
120        if self.check_alias_match(a, b) {
121            return 0.95;
122        }
123
124        // Layer 3: Rule-based ecosystem normalization
125        if let Some(score) = self.check_ecosystem_rules(a, b) {
126            if score >= 0.90 {
127                return score;
128            }
129        }
130
131        // Layer 4: Multi-field weighted scoring (if configured) or fuzzy string similarity
132        if let Some(ref weights) = self.config.field_weights {
133            // Use multi-field scoring when configured
134            let result = self.compute_multi_field_score(a, b, weights);
135            if result.total >= self.config.threshold {
136                return result.total;
137            }
138        } else {
139            // Fall back to simple fuzzy string similarity
140            let fuzzy_score = self.compute_fuzzy_score(a, b);
141            if fuzzy_score >= self.config.threshold {
142                return fuzzy_score;
143            }
144        }
145
146        0.0
147    }
148
149    /// Check if components match via alias table
150    fn check_alias_match(&self, a: &Component, b: &Component) -> bool {
151        // Check if either component's name is an alias of the other
152        let names_a = self.get_all_names(a);
153        let names_b = self.get_all_names(b);
154
155        for name_a in &names_a {
156            if let Some(canonical) = self.alias_table.get_canonical(name_a) {
157                for name_b in &names_b {
158                    if self.alias_table.is_alias(&canonical, name_b) {
159                        return true;
160                    }
161                }
162            }
163        }
164
165        false
166    }
167
168    /// Get all possible names for a component
169    fn get_all_names(&self, comp: &Component) -> Vec<String> {
170        let mut names = vec![comp.name.clone()];
171        names.extend(comp.identifiers.aliases.clone());
172
173        // Extract name from PURL if available
174        if let Some(purl) = &comp.identifiers.purl {
175            if let Some(name) = self.extract_name_from_purl(purl) {
176                names.push(name);
177            }
178        }
179
180        names
181    }
182
183    /// Extract the package name from a PURL
184    fn extract_name_from_purl(&self, purl: &str) -> Option<String> {
185        // pkg:type/namespace/name@version?qualifiers#subpath
186        let without_pkg = purl.strip_prefix("pkg:")?;
187        let parts: Vec<&str> = without_pkg.split('/').collect();
188
189        if parts.len() >= 2 {
190            let name_part = parts.last()?;
191            // Remove version and qualifiers
192            let name = name_part.split('@').next()?;
193            Some(name.to_string())
194        } else {
195            None
196        }
197    }
198
199    /// Check ecosystem-specific matching rules
200    fn check_ecosystem_rules(&self, a: &Component, b: &Component) -> Option<f64> {
201        let ecosystem_a = a.ecosystem.as_ref()?;
202        let ecosystem_b = b.ecosystem.as_ref()?;
203
204        // Must be same ecosystem for rule-based matching
205        if ecosystem_a != ecosystem_b {
206            return None;
207        }
208
209        let norm_a = self.ecosystem_rules.normalize_name(&a.name, ecosystem_a);
210        let norm_b = self.ecosystem_rules.normalize_name(&b.name, ecosystem_b);
211
212        if norm_a == norm_b {
213            return Some(0.90);
214        }
215
216        None
217    }
218
219    /// Compute fuzzy string similarity score
220    fn compute_fuzzy_score(&self, a: &Component, b: &Component) -> f64 {
221        let name_a = a.name.to_lowercase();
222        let name_b = b.name.to_lowercase();
223
224        // Compute Jaro-Winkler similarity
225        let jw_score = jaro_winkler(&name_a, &name_b);
226
227        // Compute normalized Levenshtein distance
228        let max_len = name_a.len().max(name_b.len());
229        let lev_distance = levenshtein(&name_a, &name_b);
230        let lev_score = if max_len > 0 {
231            1.0 - (lev_distance as f64 / max_len as f64)
232        } else {
233            1.0
234        };
235
236        // Compute token-based similarity (catches reordered names like "react-dom" vs "dom-react")
237        let token_score = Self::compute_token_similarity(&name_a, &name_b);
238
239        // Compute phonetic similarity (catches typos like "color" vs "colour")
240        let phonetic_score = Self::compute_phonetic_similarity(&name_a, &name_b);
241
242        // Weighted combination of character-based scores
243        let char_score = (jw_score * self.config.jaro_winkler_weight)
244            + (lev_score * self.config.levenshtein_weight);
245
246        // Use the MAXIMUM of character, token, and phonetic scores
247        // This allows each method to catch different types of variations
248        let combined = char_score.max(token_score).max(phonetic_score * 0.85);
249
250        // Version-aware boost (semantic version similarity)
251        let version_boost = Self::compute_version_similarity(&a.version, &b.version);
252
253        (combined + version_boost).min(1.0)
254    }
255
256    /// Compute token-based similarity using Jaccard index on name tokens.
257    ///
258    /// Splits names on common delimiters (-, _, ., @, /) and compares token sets.
259    /// This catches reordered names like "react-dom" ↔ "dom-react".
260    fn compute_token_similarity(name_a: &str, name_b: &str) -> f64 {
261        use std::collections::HashSet;
262
263        let tokens_a: HashSet<&str> = name_a
264            .split(['-', '_', '.', '@', '/'])
265            .filter(|t| !t.is_empty())
266            .collect();
267
268        let tokens_b: HashSet<&str> = name_b
269            .split(['-', '_', '.', '@', '/'])
270            .filter(|t| !t.is_empty())
271            .collect();
272
273        if tokens_a.is_empty() && tokens_b.is_empty() {
274            return 1.0;
275        }
276        if tokens_a.is_empty() || tokens_b.is_empty() {
277            return 0.0;
278        }
279
280        let intersection = tokens_a.intersection(&tokens_b).count();
281        let union = tokens_a.union(&tokens_b).count();
282
283        if union > 0 {
284            intersection as f64 / union as f64
285        } else {
286            0.0
287        }
288    }
289
290    /// Compute version similarity with semantic awareness.
291    ///
292    /// Returns a boost value (0.0 - 0.1) based on how similar versions are:
293    /// - Exact match: 0.10
294    /// - Same major.minor: 0.07
295    /// - Same major: 0.04
296    /// - Both present but different: 0.0
297    /// - One or both missing: 0.0
298    fn compute_version_similarity(va: &Option<String>, vb: &Option<String>) -> f64 {
299        match (va, vb) {
300            (Some(a), Some(b)) if a == b => 0.10, // Exact match
301            (Some(a), Some(b)) => {
302                // Parse semantic versions
303                let parts_a: Vec<&str> = a.split('.').collect();
304                let parts_b: Vec<&str> = b.split('.').collect();
305
306                // Extract major.minor.patch (handle non-numeric gracefully)
307                let major_a = parts_a.first().and_then(|s| s.parse::<u32>().ok());
308                let major_b = parts_b.first().and_then(|s| s.parse::<u32>().ok());
309                let minor_a = parts_a
310                    .get(1)
311                    .and_then(|s| s.split('-').next())
312                    .and_then(|s| s.parse::<u32>().ok());
313                let minor_b = parts_b
314                    .get(1)
315                    .and_then(|s| s.split('-').next())
316                    .and_then(|s| s.parse::<u32>().ok());
317
318                match (major_a, major_b, minor_a, minor_b) {
319                    (Some(ma), Some(mb), Some(mia), Some(mib)) if ma == mb && mia == mib => 0.07,
320                    (Some(ma), Some(mb), _, _) if ma == mb => 0.04,
321                    _ => 0.0,
322                }
323            }
324            _ => 0.0, // One or both missing
325        }
326    }
327
328    /// Compute Soundex code for phonetic matching.
329    ///
330    /// Soundex encodes names by their pronunciation, helping match:
331    /// - "color" ↔ "colour"
332    /// - "jason" ↔ "jayson"
333    /// - "smith" ↔ "smyth"
334    fn soundex(name: &str) -> String {
335        if name.is_empty() {
336            return String::new();
337        }
338
339        let name_upper: String = name
340            .to_uppercase()
341            .chars()
342            .filter(|c| c.is_ascii_alphabetic())
343            .collect();
344        if name_upper.is_empty() {
345            return String::new();
346        }
347
348        let mut chars = name_upper.chars();
349        let first_char = chars.next().expect("name_upper is non-empty after empty check above");
350        let mut code = String::with_capacity(4);
351        code.push(first_char);
352
353        let mut last_digit = Self::soundex_digit(first_char);
354
355        for c in chars {
356            let digit = Self::soundex_digit(c);
357            if digit != '0' && digit != last_digit {
358                code.push(digit);
359                if code.len() == 4 {
360                    break;
361                }
362            }
363            if digit != '0' {
364                last_digit = digit;
365            }
366        }
367
368        // Pad with zeros if needed
369        while code.len() < 4 {
370            code.push('0');
371        }
372
373        code
374    }
375
376    /// Get Soundex digit for a character.
377    fn soundex_digit(c: char) -> char {
378        match c {
379            'B' | 'F' | 'P' | 'V' => '1',
380            'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => '2',
381            'D' | 'T' => '3',
382            'L' => '4',
383            'M' | 'N' => '5',
384            'R' => '6',
385            _ => '0', // A, E, I, O, U, H, W, Y
386        }
387    }
388
389    /// Compute phonetic similarity using Soundex.
390    ///
391    /// Returns 1.0 if Soundex codes match, 0.0 otherwise.
392    /// Also checks individual tokens for partial phonetic matches.
393    pub fn compute_phonetic_similarity(name_a: &str, name_b: &str) -> f64 {
394        // Compare full name Soundex
395        let soundex_a = Self::soundex(name_a);
396        let soundex_b = Self::soundex(name_b);
397
398        if !soundex_a.is_empty() && soundex_a == soundex_b {
399            return 1.0;
400        }
401
402        // Compare token-by-token for compound names
403        let tokens_a: Vec<&str> = name_a
404            .split(|c: char| !c.is_alphanumeric())
405            .filter(|t| !t.is_empty())
406            .collect();
407        let tokens_b: Vec<&str> = name_b
408            .split(|c: char| !c.is_alphanumeric())
409            .filter(|t| !t.is_empty())
410            .collect();
411
412        if tokens_a.is_empty() || tokens_b.is_empty() {
413            return 0.0;
414        }
415
416        // Count matching Soundex codes between tokens
417        let mut matches = 0;
418        let total = tokens_a.len().max(tokens_b.len());
419
420        for ta in &tokens_a {
421            let sa = Self::soundex(ta);
422            if sa.is_empty() {
423                continue;
424            }
425            for tb in &tokens_b {
426                let sb = Self::soundex(tb);
427                if sa == sb {
428                    matches += 1;
429                    break;
430                }
431            }
432        }
433
434        if total == 0 {
435            0.0
436        } else {
437            matches as f64 / total as f64
438        }
439    }
440
441    /// Compute multi-field weighted score.
442    ///
443    /// Combines scores from multiple component fields based on configured weights.
444    pub fn compute_multi_field_score(
445        &self,
446        a: &Component,
447        b: &Component,
448        weights: &config::MultiFieldWeights,
449    ) -> MultiFieldScoreResult {
450        use std::collections::HashSet;
451
452        let mut result = MultiFieldScoreResult::default();
453
454        // 1. Name similarity (using fuzzy scoring)
455        let name_score = self.compute_fuzzy_score(a, b);
456        result.name_score = name_score;
457        result.total += name_score * weights.name;
458
459        // 2. Version match (graduated or binary scoring)
460        let version_score = if weights.version_divergence_enabled {
461            compute_version_divergence_score(&a.version, &b.version, weights)
462        } else {
463            // Legacy binary scoring
464            match (&a.version, &b.version) {
465                (Some(va), Some(vb)) if va == vb => 1.0,
466                (None, None) => 0.5, // Both missing = neutral
467                _ => 0.0,
468            }
469        };
470        result.version_score = version_score;
471        result.total += version_score * weights.version;
472
473        // 3. Ecosystem match (exact match = 1.0, mismatch applies penalty)
474        let (ecosystem_score, ecosystem_penalty) = match (&a.ecosystem, &b.ecosystem) {
475            (Some(ea), Some(eb)) if ea == eb => (1.0, 0.0),
476            (None, None) => (0.5, 0.0), // Both missing = neutral, no penalty
477            (Some(_), Some(_)) => (0.0, weights.ecosystem_mismatch_penalty), // Different ecosystems = penalty
478            _ => (0.0, 0.0), // One missing = no match but no penalty
479        };
480        result.ecosystem_score = ecosystem_score;
481        result.total += ecosystem_score * weights.ecosystem + ecosystem_penalty;
482
483        // 4. License overlap (Jaccard similarity on declared licenses)
484        let licenses_a: HashSet<_> = a
485            .licenses
486            .declared
487            .iter()
488            .map(|l| l.expression.as_str())
489            .collect();
490        let licenses_b: HashSet<_> = b
491            .licenses
492            .declared
493            .iter()
494            .map(|l| l.expression.as_str())
495            .collect();
496        let license_score = if licenses_a.is_empty() && licenses_b.is_empty() {
497            0.5 // Both empty = neutral
498        } else if licenses_a.is_empty() || licenses_b.is_empty() {
499            0.0 // One empty = no match
500        } else {
501            let intersection = licenses_a.intersection(&licenses_b).count();
502            let union = licenses_a.union(&licenses_b).count();
503            if union > 0 {
504                intersection as f64 / union as f64
505            } else {
506                0.0
507            }
508        };
509        result.license_score = license_score;
510        result.total += license_score * weights.licenses;
511
512        // 5. Supplier match (exact match on supplier organization name)
513        let supplier_score = match (&a.supplier, &b.supplier) {
514            (Some(sa), Some(sb)) if sa.name.to_lowercase() == sb.name.to_lowercase() => 1.0,
515            (None, None) => 0.5, // Both missing = neutral
516            _ => 0.0,
517        };
518        result.supplier_score = supplier_score;
519        result.total += supplier_score * weights.supplier;
520
521        // 6. Group/namespace match
522        let group_score = match (&a.group, &b.group) {
523            (Some(ga), Some(gb)) if ga.to_lowercase() == gb.to_lowercase() => 1.0,
524            (None, None) => 0.5, // Both missing = neutral
525            _ => 0.0,
526        };
527        result.group_score = group_score;
528        result.total += group_score * weights.group;
529
530        // Clamp total to [0.0, 1.0] after penalty application
531        result.total = result.total.clamp(0.0, 1.0);
532
533        result
534    }
535}
536
537/// Compute version divergence score using semver distance.
538///
539/// Returns a graduated score based on how different the versions are:
540/// - Exact match: 1.0
541/// - Same major.minor: 0.8 - (patch_diff * 0.01), min 0.5
542/// - Same major: 0.5 - (minor_diff * minor_penalty), min 0.2
543/// - Different major: 0.3 - (major_diff * major_penalty), min 0.0
544fn compute_version_divergence_score(
545    version_a: &Option<String>,
546    version_b: &Option<String>,
547    weights: &config::MultiFieldWeights,
548) -> f64 {
549    match (version_a, version_b) {
550        (Some(va), Some(vb)) if va == vb => 1.0,
551        (None, None) => 0.5, // Both missing = neutral
552        (Some(va), Some(vb)) => {
553            // Parse semver components
554            let parts_a = parse_semver_parts(va);
555            let parts_b = parse_semver_parts(vb);
556
557            match (parts_a, parts_b) {
558                (Some((maj_a, min_a, patch_a)), Some((maj_b, min_b, patch_b))) => {
559                    if maj_a == maj_b && min_a == min_b {
560                        // Same major.minor - small penalty for patch difference
561                        let patch_diff = (patch_a as i64 - patch_b as i64).unsigned_abs() as f64;
562                        (0.8 - patch_diff * 0.01).max(0.5)
563                    } else if maj_a == maj_b {
564                        // Same major - moderate penalty for minor difference
565                        let minor_diff = (min_a as i64 - min_b as i64).unsigned_abs() as f64;
566                        (0.5 - minor_diff * weights.version_minor_penalty).max(0.2)
567                    } else {
568                        // Different major - larger penalty
569                        let major_diff = (maj_a as i64 - maj_b as i64).unsigned_abs() as f64;
570                        (0.3 - major_diff * weights.version_major_penalty).max(0.0)
571                    }
572                }
573                _ => {
574                    // Couldn't parse semver - fall back to string comparison
575                    // Give partial credit if versions share a common prefix
576                    let common_prefix_len = va
577                        .chars()
578                        .zip(vb.chars())
579                        .take_while(|(a, b)| a == b)
580                        .count();
581                    let max_len = va.len().max(vb.len());
582                    if max_len > 0 && common_prefix_len > 0 {
583                        (common_prefix_len as f64 / max_len as f64 * 0.5).min(0.4)
584                    } else {
585                        0.1 // Different versions with no common prefix
586                    }
587                }
588            }
589        }
590        _ => 0.0, // One missing
591    }
592}
593
594/// Parse a version string into semver components (major, minor, patch).
595/// Returns None if the version cannot be parsed.
596fn parse_semver_parts(version: &str) -> Option<(u32, u32, u32)> {
597    // Strip common prefixes like 'v' or 'V'
598    let version = version.trim_start_matches(['v', 'V']);
599
600    // Split on '.' and try to parse first three components
601    let mut parts = version.split(['.', '-', '+']);
602
603    let major: u32 = parts.next()?.parse().ok()?;
604    let minor: u32 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
605    let patch: u32 = parts.next().and_then(|s| s.parse().ok()).unwrap_or(0);
606
607    Some((major, minor, patch))
608}
609
610/// Result of multi-field scoring with per-field breakdown.
611#[derive(Debug, Clone, Default)]
612pub struct MultiFieldScoreResult {
613    /// Total weighted score (0.0 - 1.0)
614    pub total: f64,
615    /// Name similarity score
616    pub name_score: f64,
617    /// Version match score
618    pub version_score: f64,
619    /// Ecosystem match score
620    pub ecosystem_score: f64,
621    /// License overlap score (Jaccard)
622    pub license_score: f64,
623    /// Supplier match score
624    pub supplier_score: f64,
625    /// Group/namespace match score
626    pub group_score: f64,
627}
628
629impl MultiFieldScoreResult {
630    /// Get a human-readable summary of the score breakdown.
631    pub fn summary(&self) -> String {
632        format!(
633            "Total: {:.2} (name: {:.2}, version: {:.2}, ecosystem: {:.2}, licenses: {:.2}, supplier: {:.2}, group: {:.2})",
634            self.total,
635            self.name_score,
636            self.version_score,
637            self.ecosystem_score,
638            self.license_score,
639            self.supplier_score,
640            self.group_score
641        )
642    }
643}
644
645impl Default for FuzzyMatcher {
646    fn default() -> Self {
647        Self::new(FuzzyMatchConfig::balanced())
648    }
649}
650
651impl ComponentMatcher for FuzzyMatcher {
652    fn match_score(&self, a: &Component, b: &Component) -> f64 {
653        self.match_components(a, b)
654    }
655
656    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
657        // Layer 1: Exact PURL match
658        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
659            let norm_a = self.purl_normalizer.normalize(purl_a);
660            let norm_b = self.purl_normalizer.normalize(purl_b);
661            if norm_a == norm_b {
662                return MatchResult::with_metadata(
663                    1.0,
664                    MatchTier::ExactIdentifier,
665                    MatchMetadata {
666                        matched_fields: vec!["purl".to_string()],
667                        normalization: Some("purl_normalized".to_string()),
668                        rule_id: None,
669                    },
670                );
671            }
672        }
673
674        // Layer 2: Alias table lookup
675        if self.check_alias_match(a, b) {
676            return MatchResult::with_metadata(
677                0.95,
678                MatchTier::Alias,
679                MatchMetadata {
680                    matched_fields: vec!["name".to_string()],
681                    normalization: Some("alias_table".to_string()),
682                    rule_id: None,
683                },
684            );
685        }
686
687        // Layer 3: Rule-based ecosystem normalization
688        if let Some(score) = self.check_ecosystem_rules(a, b) {
689            if score >= 0.90 {
690                return MatchResult::with_metadata(
691                    score,
692                    MatchTier::EcosystemRule,
693                    MatchMetadata {
694                        matched_fields: vec!["name".to_string(), "ecosystem".to_string()],
695                        normalization: Some("ecosystem_rules".to_string()),
696                        rule_id: None,
697                    },
698                );
699            }
700        }
701
702        // Layer 4: Fuzzy string similarity
703        let fuzzy_score = self.compute_fuzzy_score(a, b);
704        if fuzzy_score >= self.config.threshold {
705            return MatchResult::with_metadata(
706                fuzzy_score,
707                MatchTier::Fuzzy,
708                MatchMetadata {
709                    matched_fields: vec!["name".to_string()],
710                    normalization: Some("fuzzy_similarity".to_string()),
711                    rule_id: None,
712                },
713            );
714        }
715
716        MatchResult::no_match()
717    }
718
719    fn name(&self) -> &str {
720        "FuzzyMatcher"
721    }
722
723    fn threshold(&self) -> f64 {
724        self.config.threshold
725    }
726
727    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
728        use strsim::{jaro_winkler, levenshtein};
729
730        // Layer 1: Exact PURL match
731        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
732            let norm_a = self.purl_normalizer.normalize(purl_a);
733            let norm_b = self.purl_normalizer.normalize(purl_b);
734            if norm_a == norm_b {
735                return MatchExplanation::matched(
736                    MatchTier::ExactIdentifier,
737                    1.0,
738                    format!(
739                        "Exact PURL match: '{}' equals '{}' after normalization",
740                        purl_a, purl_b
741                    ),
742                )
743                .with_normalization("purl_normalized");
744            }
745        }
746
747        // Layer 2: Alias table lookup
748        if self.check_alias_match(a, b) {
749            return MatchExplanation::matched(
750                MatchTier::Alias,
751                0.95,
752                format!(
753                    "'{}' and '{}' are known aliases of the same package",
754                    a.name, b.name
755                ),
756            )
757            .with_normalization("alias_table");
758        }
759
760        // Layer 3: Rule-based ecosystem normalization
761        if let Some(score) = self.check_ecosystem_rules(a, b) {
762            if score >= 0.90 {
763                let ecosystem = a
764                    .ecosystem
765                    .as_ref()
766                    .map(|e| e.to_string())
767                    .unwrap_or_else(|| "unknown".to_string());
768                return MatchExplanation::matched(
769                    MatchTier::EcosystemRule,
770                    score,
771                    format!(
772                        "Names match after {} ecosystem normalization: '{}' -> '{}'",
773                        ecosystem, a.name, b.name
774                    ),
775                )
776                .with_normalization(format!("{}_normalization", ecosystem));
777            }
778        }
779
780        // Layer 4: Fuzzy string similarity - compute detailed breakdown
781        let name_a = a.name.to_lowercase();
782        let name_b = b.name.to_lowercase();
783
784        let jw_score = jaro_winkler(&name_a, &name_b);
785        let max_len = name_a.len().max(name_b.len());
786        let lev_distance = levenshtein(&name_a, &name_b);
787        let lev_score = if max_len > 0 {
788            1.0 - (lev_distance as f64 / max_len as f64)
789        } else {
790            1.0
791        };
792
793        let jw_weighted = jw_score * self.config.jaro_winkler_weight;
794        let lev_weighted = lev_score * self.config.levenshtein_weight;
795
796        let version_boost = if a.version == b.version && a.version.is_some() {
797            0.05
798        } else {
799            0.0
800        };
801
802        let combined = (jw_weighted + lev_weighted + version_boost).min(1.0);
803
804        let mut explanation = if combined >= self.config.threshold {
805            MatchExplanation::matched(
806                MatchTier::Fuzzy,
807                combined,
808                format!(
809                    "Fuzzy match: '{}' ~ '{}' with {:.0}% similarity",
810                    a.name,
811                    b.name,
812                    combined * 100.0
813                ),
814            )
815        } else {
816            MatchExplanation::no_match(format!(
817                "Fuzzy similarity {:.2} below threshold {:.2}",
818                combined, self.config.threshold
819            ))
820        };
821
822        // Add score breakdown
823        explanation = explanation
824            .with_score_component(ScoreComponent {
825                name: "Jaro-Winkler".to_string(),
826                weight: self.config.jaro_winkler_weight,
827                raw_score: jw_score,
828                weighted_score: jw_weighted,
829                description: format!("'{}' vs '{}' = {:.2}", name_a, name_b, jw_score),
830            })
831            .with_score_component(ScoreComponent {
832                name: "Levenshtein".to_string(),
833                weight: self.config.levenshtein_weight,
834                raw_score: lev_score,
835                weighted_score: lev_weighted,
836                description: format!(
837                    "edit distance {} / max_len {} = {:.2}",
838                    lev_distance, max_len, lev_score
839                ),
840            });
841
842        if version_boost > 0.0 {
843            explanation = explanation.with_score_component(ScoreComponent {
844                name: "Version boost".to_string(),
845                weight: 1.0,
846                raw_score: version_boost,
847                weighted_score: version_boost,
848                description: format!("versions match: {:?}", a.version),
849            });
850        }
851
852        explanation.with_normalization("lowercase")
853    }
854}
855
856#[cfg(test)]
857mod tests {
858    use super::*;
859
860    #[test]
861    fn test_exact_purl_match() {
862        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
863
864        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
865        a.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
866
867        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
868        b.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
869
870        assert_eq!(matcher.match_components(&a, &b), 1.0);
871    }
872
873    #[test]
874    fn test_fuzzy_name_match() {
875        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
876
877        // Similar names should have some fuzzy match score
878        let a = Component::new("lodash-es".to_string(), "comp-1".to_string());
879        let b = Component::new("lodash".to_string(), "comp-2".to_string());
880
881        let score = matcher.match_components(&a, &b);
882        // With permissive threshold (0.70), similar names should match
883        assert!(
884            score >= 0.70,
885            "lodash-es vs lodash should have score >= 0.70, got {}",
886            score
887        );
888    }
889
890    #[test]
891    fn test_different_names_low_score() {
892        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::strict());
893
894        let a = Component::new("react".to_string(), "comp-1".to_string());
895        let b = Component::new("angular".to_string(), "comp-2".to_string());
896
897        let score = matcher.match_components(&a, &b);
898        assert!(
899            score < 0.5,
900            "react vs angular should have low score, got {}",
901            score
902        );
903    }
904
905    #[test]
906    fn test_multi_field_weights_normalized() {
907        let weights = config::MultiFieldWeights::balanced();
908        assert!(
909            weights.is_normalized(),
910            "Balanced weights should be normalized"
911        );
912
913        let weights = config::MultiFieldWeights::name_focused();
914        assert!(
915            weights.is_normalized(),
916            "Name-focused weights should be normalized"
917        );
918
919        let weights = config::MultiFieldWeights::security_focused();
920        assert!(
921            weights.is_normalized(),
922            "Security-focused weights should be normalized"
923        );
924    }
925
926    #[test]
927    fn test_multi_field_scoring_same_component() {
928        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
929        let weights = config::MultiFieldWeights::balanced();
930
931        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
932        a.version = Some("4.17.21".to_string());
933        a.ecosystem = Some(crate::model::Ecosystem::Npm);
934
935        // Identical component should score very high
936        // Note: empty licenses/supplier/group get neutral 0.5 score, so total won't be 1.0
937        let result = matcher.compute_multi_field_score(&a, &a, &weights);
938        assert!(
939            result.total > 0.90,
940            "Same component should score > 0.90, got {}",
941            result.total
942        );
943        assert_eq!(result.name_score, 1.0);
944        assert_eq!(result.version_score, 1.0);
945        assert_eq!(result.ecosystem_score, 1.0);
946        // Empty fields get neutral 0.5 score
947        assert_eq!(
948            result.license_score, 0.5,
949            "Empty licenses should be neutral"
950        );
951        assert_eq!(
952            result.supplier_score, 0.5,
953            "Empty supplier should be neutral"
954        );
955        assert_eq!(result.group_score, 0.5, "Empty group should be neutral");
956    }
957
958    #[test]
959    fn test_multi_field_scoring_different_versions() {
960        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
961        let weights = config::MultiFieldWeights::balanced();
962
963        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
964        a.version = Some("4.17.21".to_string());
965        a.ecosystem = Some(crate::model::Ecosystem::Npm);
966
967        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
968        b.version = Some("4.17.20".to_string()); // Different patch version
969        b.ecosystem = Some(crate::model::Ecosystem::Npm);
970
971        let result = matcher.compute_multi_field_score(&a, &b, &weights);
972
973        // Name matches perfectly
974        assert!(result.name_score > 0.9, "Name score should be > 0.9");
975
976        // Graduated version scoring: same major.minor gives high score
977        // 4.17.21 vs 4.17.20 = same major.minor, patch diff of 1
978        // Expected: 0.8 - 0.01 * 1 = 0.79
979        assert!(
980            result.version_score > 0.7,
981            "Same major.minor with patch diff should score high, got {}",
982            result.version_score
983        );
984
985        // Ecosystem matches
986        assert_eq!(
987            result.ecosystem_score, 1.0,
988            "Same ecosystem should score 1.0"
989        );
990
991        // Total should be high due to name, ecosystem, and graduated version score
992        assert!(
993            result.total > 0.8,
994            "Total should be > 0.8, got {}",
995            result.total
996        );
997    }
998
999    #[test]
1000    fn test_multi_field_scoring_different_major_versions() {
1001        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
1002        let weights = config::MultiFieldWeights::balanced();
1003
1004        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
1005        a.version = Some("4.17.21".to_string());
1006        a.ecosystem = Some(crate::model::Ecosystem::Npm);
1007
1008        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
1009        b.version = Some("3.10.0".to_string()); // Different major version
1010        b.ecosystem = Some(crate::model::Ecosystem::Npm);
1011
1012        let result = matcher.compute_multi_field_score(&a, &b, &weights);
1013
1014        // Graduated version scoring: different major gives low score
1015        // 4 vs 3 = major diff of 1
1016        // Expected: 0.3 - 0.10 * 1 = 0.20
1017        assert!(
1018            result.version_score < 0.3,
1019            "Different major versions should score low, got {}",
1020            result.version_score
1021        );
1022    }
1023
1024    #[test]
1025    fn test_multi_field_scoring_legacy_weights() {
1026        // Test that legacy weights disable graduated scoring
1027        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
1028        let weights = config::MultiFieldWeights::legacy();
1029
1030        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
1031        a.version = Some("4.17.21".to_string());
1032        a.ecosystem = Some(crate::model::Ecosystem::Npm);
1033
1034        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
1035        b.version = Some("4.17.20".to_string());
1036        b.ecosystem = Some(crate::model::Ecosystem::Npm);
1037
1038        let result = matcher.compute_multi_field_score(&a, &b, &weights);
1039
1040        // Legacy mode: binary version scoring (exact match or 0)
1041        assert_eq!(
1042            result.version_score, 0.0,
1043            "Legacy mode: different versions should score 0"
1044        );
1045    }
1046
1047    #[test]
1048    fn test_multi_field_config_preset() {
1049        let config = FuzzyMatchConfig::from_preset("balanced-multi").unwrap();
1050        assert!(config.field_weights.is_some());
1051
1052        let config = FuzzyMatchConfig::from_preset("strict_multi").unwrap();
1053        assert!(config.field_weights.is_some());
1054    }
1055
1056    #[test]
1057    fn test_multi_field_score_result_summary() {
1058        let result = MultiFieldScoreResult {
1059            total: 0.85,
1060            name_score: 1.0,
1061            version_score: 0.0,
1062            ecosystem_score: 1.0,
1063            license_score: 0.5,
1064            supplier_score: 0.5,
1065            group_score: 0.5,
1066        };
1067
1068        let summary = result.summary();
1069        assert!(summary.contains("0.85"));
1070        assert!(summary.contains("name: 1.00"));
1071    }
1072
1073    #[test]
1074    fn test_token_similarity_exact() {
1075        let score = FuzzyMatcher::compute_token_similarity("react-dom", "react-dom");
1076        assert_eq!(score, 1.0);
1077    }
1078
1079    #[test]
1080    fn test_token_similarity_reordered() {
1081        // Reordered tokens should have high similarity
1082        let score = FuzzyMatcher::compute_token_similarity("react-dom", "dom-react");
1083        assert_eq!(score, 1.0, "Reordered tokens should match perfectly");
1084    }
1085
1086    #[test]
1087    fn test_token_similarity_partial() {
1088        // Partial token overlap
1089        let score = FuzzyMatcher::compute_token_similarity("react-dom-utils", "react-dom");
1090        // Jaccard: 2 common / 3 total = 0.667
1091        assert!(
1092            (score - 0.667).abs() < 0.01,
1093            "Partial overlap should be ~0.67, got {}",
1094            score
1095        );
1096    }
1097
1098    #[test]
1099    fn test_token_similarity_different_delimiters() {
1100        // Different delimiters should still work
1101        let score = FuzzyMatcher::compute_token_similarity("my_package_name", "my-package-name");
1102        assert_eq!(score, 1.0, "Different delimiters should match");
1103    }
1104
1105    #[test]
1106    fn test_token_similarity_no_overlap() {
1107        let score = FuzzyMatcher::compute_token_similarity("react", "angular");
1108        assert_eq!(score, 0.0, "No common tokens should score 0");
1109    }
1110
1111    #[test]
1112    fn test_version_similarity_exact() {
1113        let score = FuzzyMatcher::compute_version_similarity(
1114            &Some("1.2.3".to_string()),
1115            &Some("1.2.3".to_string()),
1116        );
1117        assert_eq!(score, 0.10, "Exact version match should give max boost");
1118    }
1119
1120    #[test]
1121    fn test_version_similarity_same_major_minor() {
1122        let score = FuzzyMatcher::compute_version_similarity(
1123            &Some("1.2.3".to_string()),
1124            &Some("1.2.4".to_string()),
1125        );
1126        assert_eq!(score, 0.07, "Same major.minor should give 0.07 boost");
1127    }
1128
1129    #[test]
1130    fn test_version_similarity_same_major() {
1131        let score = FuzzyMatcher::compute_version_similarity(
1132            &Some("1.2.3".to_string()),
1133            &Some("1.5.0".to_string()),
1134        );
1135        assert_eq!(score, 0.04, "Same major should give 0.04 boost");
1136    }
1137
1138    #[test]
1139    fn test_version_similarity_different_major() {
1140        let score = FuzzyMatcher::compute_version_similarity(
1141            &Some("1.2.3".to_string()),
1142            &Some("2.0.0".to_string()),
1143        );
1144        assert_eq!(score, 0.0, "Different major versions should give no boost");
1145    }
1146
1147    #[test]
1148    fn test_version_similarity_prerelease() {
1149        // Handle prerelease versions like "1.2.3-beta"
1150        let score = FuzzyMatcher::compute_version_similarity(
1151            &Some("1.2.3-beta".to_string()),
1152            &Some("1.2.4".to_string()),
1153        );
1154        assert_eq!(score, 0.07, "Prerelease should still match major.minor");
1155    }
1156
1157    #[test]
1158    fn test_version_similarity_missing() {
1159        let score = FuzzyMatcher::compute_version_similarity(&None, &Some("1.0.0".to_string()));
1160        assert_eq!(score, 0.0, "Missing version should give no boost");
1161
1162        let score = FuzzyMatcher::compute_version_similarity(&None, &None);
1163        assert_eq!(score, 0.0, "Both missing should give no boost");
1164    }
1165
1166    #[test]
1167    fn test_fuzzy_match_with_reordered_tokens() {
1168        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
1169
1170        let a = Component::new("react-dom".to_string(), "comp-1".to_string());
1171        let b = Component::new("dom-react".to_string(), "comp-2".to_string());
1172
1173        let score = matcher.match_components(&a, &b);
1174        // Token similarity is 1.0, blended with character similarity
1175        assert!(
1176            score > 0.5,
1177            "Reordered names should still match, got {}",
1178            score
1179        );
1180    }
1181
1182    #[test]
1183    fn test_fuzzy_match_version_boost() {
1184        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
1185
1186        // Use slightly different names so we rely on fuzzy matching, not exact match
1187        let mut a = Component::new("lodash-utils".to_string(), "comp-1".to_string());
1188        a.version = Some("4.17.21".to_string());
1189
1190        let mut b = Component::new("lodash-util".to_string(), "comp-2".to_string());
1191        b.version = Some("4.17.20".to_string()); // Same major.minor -> +0.07 boost
1192
1193        let mut c = Component::new("lodash-util".to_string(), "comp-3".to_string());
1194        c.version = Some("5.0.0".to_string()); // Different major -> +0.0 boost
1195
1196        let score_same_minor = matcher.match_components(&a, &b);
1197        let score_diff_major = matcher.match_components(&a, &c);
1198
1199        // Both should match (fuzzy), but same_minor should have version boost
1200        assert!(score_same_minor > 0.0, "Same minor should match");
1201        assert!(score_diff_major > 0.0, "Different major should still match");
1202        assert!(
1203            score_same_minor > score_diff_major,
1204            "Same minor version should score higher: {} vs {}",
1205            score_same_minor,
1206            score_diff_major
1207        );
1208    }
1209
1210    #[test]
1211    fn test_soundex_basic() {
1212        // Test basic Soundex encoding
1213        assert_eq!(FuzzyMatcher::soundex("Robert"), "R163");
1214        assert_eq!(FuzzyMatcher::soundex("Rupert"), "R163"); // Same as Robert
1215        assert_eq!(FuzzyMatcher::soundex("Smith"), "S530");
1216        assert_eq!(FuzzyMatcher::soundex("Smyth"), "S530"); // Same as Smith
1217    }
1218
1219    #[test]
1220    fn test_soundex_empty() {
1221        assert_eq!(FuzzyMatcher::soundex(""), "");
1222        assert_eq!(FuzzyMatcher::soundex("123"), ""); // No letters
1223    }
1224
1225    #[test]
1226    fn test_phonetic_similarity_exact() {
1227        let score = FuzzyMatcher::compute_phonetic_similarity("color", "colour");
1228        assert_eq!(score, 1.0, "color and colour should match phonetically");
1229    }
1230
1231    #[test]
1232    fn test_phonetic_similarity_different() {
1233        let score = FuzzyMatcher::compute_phonetic_similarity("react", "angular");
1234        assert!(
1235            score < 0.5,
1236            "Different names should have low phonetic similarity"
1237        );
1238    }
1239
1240    #[test]
1241    fn test_phonetic_similarity_compound() {
1242        // Test compound names where tokens match phonetically
1243        let score = FuzzyMatcher::compute_phonetic_similarity("json-parser", "jayson-parser");
1244        assert!(
1245            score > 0.5,
1246            "Similar sounding compound names should match: {}",
1247            score
1248        );
1249    }
1250
1251    #[test]
1252    fn test_fuzzy_match_with_phonetic() {
1253        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
1254
1255        let a = Component::new("color-utils".to_string(), "comp-1".to_string());
1256        let b = Component::new("colour-utils".to_string(), "comp-2".to_string());
1257
1258        let score = matcher.match_components(&a, &b);
1259        assert!(
1260            score > 0.7,
1261            "Phonetically similar names should match: {}",
1262            score
1263        );
1264    }
1265}