sbom_tools/matching/
mod.rs

1//! Fuzzy matching engine for cross-ecosystem package correlation.
2//!
3//! This module provides multi-tier matching strategies for correlating
4//! components across different ecosystems and naming conventions.
5//!
6//! # Architecture
7//!
8//! The matching system is built on the [`ComponentMatcher`] trait, which
9//! provides a pluggable interface for different matching strategies:
10//!
11//! - [`FuzzyMatcher`]: Multi-tier fuzzy matching (default)
12//! - [`CompositeMatcher`]: Combines multiple matchers
13//! - [`CachedMatcher`]: Wraps any matcher with caching
14//!
15//! # Example
16//!
17//! ```ignore
18//! use sbom_tools::matching::{ComponentMatcher, FuzzyMatcher, FuzzyMatchConfig};
19//!
20//! // Use the trait for dependency injection
21//! fn diff_with_matcher(matcher: &dyn ComponentMatcher) {
22//!     let score = matcher.match_score(&comp_a, &comp_b);
23//! }
24//!
25//! let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
26//! diff_with_matcher(&matcher);
27//! ```
28
29pub mod adaptive;
30mod aliases;
31mod config;
32pub mod cross_ecosystem;
33pub mod custom_rules;
34pub mod ecosystem_config;
35pub mod index;
36pub mod lsh;
37mod purl;
38pub mod rule_engine;
39mod rules;
40pub mod scoring;
41pub mod string_similarity;
42mod traits;
43
44pub use adaptive::{
45    AdaptiveMatching, AdaptiveMethod, AdaptiveThreshold, AdaptiveThresholdConfig,
46    AdaptiveThresholdResult, ScoreStats,
47};
48pub use aliases::AliasTable;
49pub use config::{CrossEcosystemConfig, FuzzyMatchConfig, MultiFieldWeights};
50pub use cross_ecosystem::{CrossEcosystemDb, CrossEcosystemMatch, PackageFamily};
51pub use custom_rules::{
52    AliasPattern, EquivalenceGroup, ExclusionRule, MatchingRulesConfig, RulePrecedence,
53    RulesSummary,
54};
55pub use ecosystem_config::{
56    ConfigError, CustomEquivalence, CustomRules, EcosystemConfig, EcosystemRulesConfig,
57    GlobalSettings, GroupMigration, ImportMapping, NormalizationConfig, PackageGroup,
58    ScopeHandling, SecurityConfig, TyposquatEntry, VersionSpec, VersioningConfig,
59};
60pub use index::{
61    BatchCandidateConfig, BatchCandidateGenerator, BatchCandidateResult, BatchCandidateStats,
62    ComponentIndex, IndexStats, LazyComponentIndex, NormalizedEntry,
63};
64pub use lsh::{LshConfig, LshIndex, LshIndexStats, MinHashSignature};
65pub use purl::PurlNormalizer;
66pub use rule_engine::{AppliedRule, AppliedRuleType, RuleApplicationResult, RuleEngine};
67pub use rules::EcosystemRules;
68pub use scoring::MultiFieldScoreResult;
69pub use traits::{
70    CacheConfig, CacheStats, CachedMatcher, ComponentMatcher, CompositeMatcher,
71    CompositeMatcherBuilder, MatchExplanation, MatchMetadata, MatchResult, MatchTier,
72    ScoreComponent,
73};
74
75use crate::model::Component;
76use strsim::{jaro_winkler, levenshtein};
77
/// Fuzzy matcher for component correlation.
///
/// Bundles the matching configuration with the helpers consulted by the
/// matching tiers: PURL normalization, the alias table, per-ecosystem name
/// rules, and finally fuzzy / multi-field scoring.
#[must_use]
pub struct FuzzyMatcher {
    /// Threshold and weights used by the fuzzy and multi-field scoring tiers.
    config: FuzzyMatchConfig,
    /// Known alternative package names, consulted by the alias tier.
    alias_table: AliasTable,
    /// Normalizes PURLs before they are compared for equality.
    purl_normalizer: PurlNormalizer,
    /// Per-ecosystem name normalization used by the rule-based tier.
    ecosystem_rules: EcosystemRules,
}
86
87impl FuzzyMatcher {
    /// Create a new fuzzy matcher with the given configuration.
    ///
    /// The alias table starts as `AliasTable::default()`, and the PURL
    /// normalizer and ecosystem rules are freshly constructed.
    pub fn new(config: FuzzyMatchConfig) -> Self {
        Self {
            config,
            alias_table: AliasTable::default(),
            purl_normalizer: PurlNormalizer::new(),
            ecosystem_rules: EcosystemRules::new(),
        }
    }
97
    /// Get the current configuration.
    ///
    /// Returns a shared reference to the configuration this matcher was built with.
    #[must_use]
    pub const fn config(&self) -> &FuzzyMatchConfig {
        &self.config
    }
103
    /// Create a matcher with a custom alias table.
    ///
    /// Builder-style: consumes the matcher, replaces its alias table with
    /// `table`, and returns the updated matcher.
    pub fn with_alias_table(mut self, table: AliasTable) -> Self {
        self.alias_table = table;
        self
    }
109
110    /// Match two components and return a confidence score (0.0 - 1.0)
111    #[must_use]
112    pub fn match_components(&self, a: &Component, b: &Component) -> f64 {
113        // Layer 1: Exact PURL match
114        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
115            let norm_a = self.purl_normalizer.normalize(purl_a);
116            let norm_b = self.purl_normalizer.normalize(purl_b);
117            if norm_a == norm_b {
118                return 1.0;
119            }
120        }
121
122        // Layer 2: Alias table lookup
123        if self.check_alias_match(a, b) {
124            return 0.95;
125        }
126
127        // Layer 3: Rule-based ecosystem normalization
128        if let Some(score) = self.check_ecosystem_rules(a, b)
129            && score >= 0.90
130        {
131            return score;
132        }
133
134        // Layer 4: Multi-field weighted scoring (if configured) or fuzzy string similarity
135        if let Some(ref weights) = self.config.field_weights {
136            // Use multi-field scoring when configured
137            let result = self.compute_multi_field_score(a, b, weights);
138            if result.total >= self.config.threshold {
139                return result.total;
140            }
141        } else {
142            // Fall back to simple fuzzy string similarity
143            let fuzzy_score = self.compute_fuzzy_score(a, b);
144            if fuzzy_score >= self.config.threshold {
145                return fuzzy_score;
146            }
147        }
148
149        0.0
150    }
151
152    /// Check if components match via alias table
153    fn check_alias_match(&self, a: &Component, b: &Component) -> bool {
154        // Check if either component's name is an alias of the other
155        let names_a = self.get_all_names(a);
156        let names_b = self.get_all_names(b);
157
158        for name_a in &names_a {
159            if let Some(canonical) = self.alias_table.get_canonical(name_a) {
160                for name_b in &names_b {
161                    if self.alias_table.is_alias(&canonical, name_b) {
162                        return true;
163                    }
164                }
165            }
166        }
167
168        false
169    }
170
171    /// Get all possible names for a component
172    fn get_all_names(&self, comp: &Component) -> Vec<String> {
173        let mut names = vec![comp.name.clone()];
174        names.extend(comp.identifiers.aliases.clone());
175
176        // Extract name from PURL if available
177        if let Some(purl) = &comp.identifiers.purl
178            && let Some(name) = self.extract_name_from_purl(purl)
179        {
180            names.push(name);
181        }
182
183        names
184    }
185
186    /// Extract the package name from a PURL
187    fn extract_name_from_purl(&self, purl: &str) -> Option<String> {
188        // pkg:type/namespace/name@version?qualifiers#subpath
189        let without_pkg = purl.strip_prefix("pkg:")?;
190        let parts: Vec<&str> = without_pkg.split('/').collect();
191
192        if parts.len() >= 2 {
193            let name_part = parts.last()?;
194            // Remove version and qualifiers
195            let name = name_part.split('@').next()?;
196            Some(name.to_string())
197        } else {
198            None
199        }
200    }
201
202    /// Check ecosystem-specific matching rules
203    fn check_ecosystem_rules(&self, a: &Component, b: &Component) -> Option<f64> {
204        let ecosystem_a = a.ecosystem.as_ref()?;
205        let ecosystem_b = b.ecosystem.as_ref()?;
206
207        // Must be same ecosystem for rule-based matching
208        if ecosystem_a != ecosystem_b {
209            return None;
210        }
211
212        let norm_a = self.ecosystem_rules.normalize_name(&a.name, ecosystem_a);
213        let norm_b = self.ecosystem_rules.normalize_name(&b.name, ecosystem_b);
214
215        if norm_a == norm_b {
216            return Some(0.90);
217        }
218
219        None
220    }
221
222    /// Compute fuzzy string similarity score
223    fn compute_fuzzy_score(&self, a: &Component, b: &Component) -> f64 {
224        let name_a = a.name.to_lowercase();
225        let name_b = b.name.to_lowercase();
226
227        // Compute Jaro-Winkler similarity
228        let jw_score = jaro_winkler(&name_a, &name_b);
229
230        // Compute normalized Levenshtein distance
231        let max_len = name_a.len().max(name_b.len());
232        let lev_distance = levenshtein(&name_a, &name_b);
233        let lev_score = if max_len > 0 {
234            1.0 - (lev_distance as f64 / max_len as f64)
235        } else {
236            1.0
237        };
238
239        // Compute token-based similarity (catches reordered names like "react-dom" vs "dom-react")
240        let token_score = Self::compute_token_similarity(&name_a, &name_b);
241
242        // Compute phonetic similarity (catches typos like "color" vs "colour")
243        let phonetic_score = Self::compute_phonetic_similarity(&name_a, &name_b);
244
245        // Weighted combination of character-based scores
246        let char_score = jw_score.mul_add(
247            self.config.jaro_winkler_weight,
248            lev_score * self.config.levenshtein_weight,
249        );
250
251        // Use the MAXIMUM of character, token, and phonetic scores
252        // This allows each method to catch different types of variations
253        let combined = char_score.max(token_score).max(phonetic_score * 0.85);
254
255        // Version-aware boost (semantic version similarity)
256        let version_boost =
257            Self::compute_version_similarity(a.version.as_ref(), b.version.as_ref());
258
259        (combined + version_boost).min(1.0)
260    }
261
    /// Compute token-based similarity using Jaccard index on name tokens.
    ///
    /// Thin wrapper that forwards to
    /// [`string_similarity::compute_token_similarity`].
    fn compute_token_similarity(name_a: &str, name_b: &str) -> f64 {
        string_similarity::compute_token_similarity(name_a, name_b)
    }
266
    /// Compute version similarity with semantic awareness.
    ///
    /// Thin wrapper that forwards to
    /// [`string_similarity::compute_version_similarity`]; the result is used
    /// as an additive boost on top of the name score.
    fn compute_version_similarity(va: Option<&String>, vb: Option<&String>) -> f64 {
        string_similarity::compute_version_similarity(va, vb)
    }
271
    /// Compute phonetic similarity using Soundex.
    ///
    /// Thin wrapper that forwards to
    /// [`string_similarity::compute_phonetic_similarity`].
    #[must_use]
    pub fn compute_phonetic_similarity(name_a: &str, name_b: &str) -> f64 {
        string_similarity::compute_phonetic_similarity(name_a, name_b)
    }
277
278    /// Compute multi-field weighted score.
279    ///
280    /// Combines scores from multiple component fields based on configured weights.
281    #[must_use]
282    pub fn compute_multi_field_score(
283        &self,
284        a: &Component,
285        b: &Component,
286        weights: &config::MultiFieldWeights,
287    ) -> scoring::MultiFieldScoreResult {
288        use std::collections::HashSet;
289
290        let mut result = scoring::MultiFieldScoreResult::default();
291
292        // 1. Name similarity (using fuzzy scoring)
293        let name_score = self.compute_fuzzy_score(a, b);
294        result.name_score = name_score;
295        result.total += name_score * weights.name;
296
297        // 2. Version match (graduated or binary scoring)
298        let version_score = if weights.version_divergence_enabled {
299            scoring::compute_version_divergence_score(&a.version, &b.version, weights)
300        } else {
301            // Legacy binary scoring
302            match (&a.version, &b.version) {
303                (Some(va), Some(vb)) if va == vb => 1.0,
304                (None, None) => 0.5, // Both missing = neutral
305                _ => 0.0,
306            }
307        };
308        result.version_score = version_score;
309        result.total += version_score * weights.version;
310
311        // 3. Ecosystem match (exact match = 1.0, mismatch applies penalty)
312        let (ecosystem_score, ecosystem_penalty) = match (&a.ecosystem, &b.ecosystem) {
313            (Some(ea), Some(eb)) if ea == eb => (1.0, 0.0),
314            (None, None) => (0.5, 0.0), // Both missing = neutral, no penalty
315            (Some(_), Some(_)) => (0.0, weights.ecosystem_mismatch_penalty), // Different ecosystems = penalty
316            _ => (0.0, 0.0), // One missing = no match but no penalty
317        };
318        result.ecosystem_score = ecosystem_score;
319        result.total += ecosystem_score.mul_add(weights.ecosystem, ecosystem_penalty);
320
321        // 4. License overlap (Jaccard similarity on declared licenses)
322        let licenses_a: HashSet<_> = a
323            .licenses
324            .declared
325            .iter()
326            .map(|l| l.expression.as_str())
327            .collect();
328        let licenses_b: HashSet<_> = b
329            .licenses
330            .declared
331            .iter()
332            .map(|l| l.expression.as_str())
333            .collect();
334        let license_score = if licenses_a.is_empty() && licenses_b.is_empty() {
335            0.5 // Both empty = neutral
336        } else if licenses_a.is_empty() || licenses_b.is_empty() {
337            0.0 // One empty = no match
338        } else {
339            let intersection = licenses_a.intersection(&licenses_b).count();
340            let union = licenses_a.union(&licenses_b).count();
341            if union > 0 {
342                intersection as f64 / union as f64
343            } else {
344                0.0
345            }
346        };
347        result.license_score = license_score;
348        result.total += license_score * weights.licenses;
349
350        // 5. Supplier match (exact match on supplier organization name)
351        let supplier_score = match (&a.supplier, &b.supplier) {
352            (Some(sa), Some(sb)) if sa.name.to_lowercase() == sb.name.to_lowercase() => 1.0,
353            (None, None) => 0.5, // Both missing = neutral
354            _ => 0.0,
355        };
356        result.supplier_score = supplier_score;
357        result.total += supplier_score * weights.supplier;
358
359        // 6. Group/namespace match
360        let group_score = match (&a.group, &b.group) {
361            (Some(ga), Some(gb)) if ga.to_lowercase() == gb.to_lowercase() => 1.0,
362            (None, None) => 0.5, // Both missing = neutral
363            _ => 0.0,
364        };
365        result.group_score = group_score;
366        result.total += group_score * weights.group;
367
368        // Clamp total to [0.0, 1.0] after penalty application
369        result.total = result.total.clamp(0.0, 1.0);
370
371        result
372    }
373}
374
/// The default matcher uses the `balanced` configuration preset.
impl Default for FuzzyMatcher {
    /// Equivalent to `FuzzyMatcher::new(FuzzyMatchConfig::balanced())`.
    fn default() -> Self {
        Self::new(FuzzyMatchConfig::balanced())
    }
}
380
381impl ComponentMatcher for FuzzyMatcher {
    /// Return the match confidence for `a` and `b`.
    ///
    /// Delegates to [`FuzzyMatcher::match_components`].
    fn match_score(&self, a: &Component, b: &Component) -> f64 {
        self.match_components(a, b)
    }
385
386    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
387        // Layer 1: Exact PURL match
388        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
389            let norm_a = self.purl_normalizer.normalize(purl_a);
390            let norm_b = self.purl_normalizer.normalize(purl_b);
391            if norm_a == norm_b {
392                return MatchResult::with_metadata(
393                    1.0,
394                    MatchTier::ExactIdentifier,
395                    MatchMetadata {
396                        matched_fields: vec!["purl".to_string()],
397                        normalization: Some("purl_normalized".to_string()),
398                        rule_id: None,
399                    },
400                );
401            }
402        }
403
404        // Layer 2: Alias table lookup
405        if self.check_alias_match(a, b) {
406            return MatchResult::with_metadata(
407                0.95,
408                MatchTier::Alias,
409                MatchMetadata {
410                    matched_fields: vec!["name".to_string()],
411                    normalization: Some("alias_table".to_string()),
412                    rule_id: None,
413                },
414            );
415        }
416
417        // Layer 3: Rule-based ecosystem normalization
418        if let Some(score) = self.check_ecosystem_rules(a, b)
419            && score >= 0.90
420        {
421            return MatchResult::with_metadata(
422                score,
423                MatchTier::EcosystemRule,
424                MatchMetadata {
425                    matched_fields: vec!["name".to_string(), "ecosystem".to_string()],
426                    normalization: Some("ecosystem_rules".to_string()),
427                    rule_id: None,
428                },
429            );
430        }
431
432        // Layer 4: Fuzzy string similarity
433        let fuzzy_score = self.compute_fuzzy_score(a, b);
434        if fuzzy_score >= self.config.threshold {
435            return MatchResult::with_metadata(
436                fuzzy_score,
437                MatchTier::Fuzzy,
438                MatchMetadata {
439                    matched_fields: vec!["name".to_string()],
440                    normalization: Some("fuzzy_similarity".to_string()),
441                    rule_id: None,
442                },
443            );
444        }
445
446        MatchResult::no_match()
447    }
448
    /// Identifier of this matcher implementation; always `"FuzzyMatcher"`.
    fn name(&self) -> &'static str {
        "FuzzyMatcher"
    }
452
    /// Minimum score the final scoring tier must reach, taken from the
    /// matcher's configuration.
    fn threshold(&self) -> f64 {
        self.config.threshold
    }
456
457    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
458        use strsim::{jaro_winkler, levenshtein};
459
460        // Layer 1: Exact PURL match
461        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
462            let norm_a = self.purl_normalizer.normalize(purl_a);
463            let norm_b = self.purl_normalizer.normalize(purl_b);
464            if norm_a == norm_b {
465                return MatchExplanation::matched(
466                    MatchTier::ExactIdentifier,
467                    1.0,
468                    format!("Exact PURL match: '{purl_a}' equals '{purl_b}' after normalization"),
469                )
470                .with_normalization("purl_normalized");
471            }
472        }
473
474        // Layer 2: Alias table lookup
475        if self.check_alias_match(a, b) {
476            return MatchExplanation::matched(
477                MatchTier::Alias,
478                0.95,
479                format!(
480                    "'{}' and '{}' are known aliases of the same package",
481                    a.name, b.name
482                ),
483            )
484            .with_normalization("alias_table");
485        }
486
487        // Layer 3: Rule-based ecosystem normalization
488        if let Some(score) = self.check_ecosystem_rules(a, b)
489            && score >= 0.90
490        {
491            let ecosystem = a
492                .ecosystem
493                .as_ref()
494                .map_or_else(|| "unknown".to_string(), std::string::ToString::to_string);
495            return MatchExplanation::matched(
496                MatchTier::EcosystemRule,
497                score,
498                format!(
499                    "Names match after {} ecosystem normalization: '{}' -> '{}'",
500                    ecosystem, a.name, b.name
501                ),
502            )
503            .with_normalization(format!("{ecosystem}_normalization"));
504        }
505
506        // Layer 4: Fuzzy string similarity - compute detailed breakdown
507        let name_a = a.name.to_lowercase();
508        let name_b = b.name.to_lowercase();
509
510        let jw_score = jaro_winkler(&name_a, &name_b);
511        let max_len = name_a.len().max(name_b.len());
512        let lev_distance = levenshtein(&name_a, &name_b);
513        let lev_score = if max_len > 0 {
514            1.0 - (lev_distance as f64 / max_len as f64)
515        } else {
516            1.0
517        };
518
519        let jw_weighted = jw_score * self.config.jaro_winkler_weight;
520        let lev_weighted = lev_score * self.config.levenshtein_weight;
521
522        let version_boost = if a.version == b.version && a.version.is_some() {
523            0.05
524        } else {
525            0.0
526        };
527
528        let combined = (jw_weighted + lev_weighted + version_boost).min(1.0);
529
530        let mut explanation = if combined >= self.config.threshold {
531            MatchExplanation::matched(
532                MatchTier::Fuzzy,
533                combined,
534                format!(
535                    "Fuzzy match: '{}' ~ '{}' with {:.0}% similarity",
536                    a.name,
537                    b.name,
538                    combined * 100.0
539                ),
540            )
541        } else {
542            MatchExplanation::no_match(format!(
543                "Fuzzy similarity {:.2} below threshold {:.2}",
544                combined, self.config.threshold
545            ))
546        };
547
548        // Add score breakdown
549        explanation = explanation
550            .with_score_component(ScoreComponent {
551                name: "Jaro-Winkler".to_string(),
552                weight: self.config.jaro_winkler_weight,
553                raw_score: jw_score,
554                weighted_score: jw_weighted,
555                description: format!("'{name_a}' vs '{name_b}' = {jw_score:.2}"),
556            })
557            .with_score_component(ScoreComponent {
558                name: "Levenshtein".to_string(),
559                weight: self.config.levenshtein_weight,
560                raw_score: lev_score,
561                weighted_score: lev_weighted,
562                description: format!(
563                    "edit distance {lev_distance} / max_len {max_len} = {lev_score:.2}"
564                ),
565            });
566
567        if version_boost > 0.0 {
568            explanation = explanation.with_score_component(ScoreComponent {
569                name: "Version boost".to_string(),
570                weight: 1.0,
571                raw_score: version_boost,
572                weighted_score: version_boost,
573                description: format!("versions match: {:?}", a.version),
574            });
575        }
576
577        explanation.with_normalization("lowercase")
578    }
579}
580
581#[cfg(test)]
582mod tests {
583    use super::*;
584
585    #[test]
586    fn test_exact_purl_match() {
587        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
588
589        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
590        a.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
591
592        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
593        b.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
594
595        assert_eq!(matcher.match_components(&a, &b), 1.0);
596    }
597
598    #[test]
599    fn test_fuzzy_name_match() {
600        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
601
602        // Similar names should have some fuzzy match score
603        let a = Component::new("lodash-es".to_string(), "comp-1".to_string());
604        let b = Component::new("lodash".to_string(), "comp-2".to_string());
605
606        let score = matcher.match_components(&a, &b);
607        // With permissive threshold (0.70), similar names should match
608        assert!(
609            score >= 0.70,
610            "lodash-es vs lodash should have score >= 0.70, got {}",
611            score
612        );
613    }
614
615    #[test]
616    fn test_different_names_low_score() {
617        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::strict());
618
619        let a = Component::new("react".to_string(), "comp-1".to_string());
620        let b = Component::new("angular".to_string(), "comp-2".to_string());
621
622        let score = matcher.match_components(&a, &b);
623        assert!(
624            score < 0.5,
625            "react vs angular should have low score, got {}",
626            score
627        );
628    }
629
630    #[test]
631    fn test_multi_field_weights_normalized() {
632        let weights = config::MultiFieldWeights::balanced();
633        assert!(
634            weights.is_normalized(),
635            "Balanced weights should be normalized"
636        );
637
638        let weights = config::MultiFieldWeights::name_focused();
639        assert!(
640            weights.is_normalized(),
641            "Name-focused weights should be normalized"
642        );
643
644        let weights = config::MultiFieldWeights::security_focused();
645        assert!(
646            weights.is_normalized(),
647            "Security-focused weights should be normalized"
648        );
649    }
650
651    #[test]
652    fn test_multi_field_scoring_same_component() {
653        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
654        let weights = config::MultiFieldWeights::balanced();
655
656        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
657        a.version = Some("4.17.21".to_string());
658        a.ecosystem = Some(crate::model::Ecosystem::Npm);
659
660        // Identical component should score very high
661        // Note: empty licenses/supplier/group get neutral 0.5 score, so total won't be 1.0
662        let result = matcher.compute_multi_field_score(&a, &a, &weights);
663        assert!(
664            result.total > 0.90,
665            "Same component should score > 0.90, got {}",
666            result.total
667        );
668        assert_eq!(result.name_score, 1.0);
669        assert_eq!(result.version_score, 1.0);
670        assert_eq!(result.ecosystem_score, 1.0);
671        // Empty fields get neutral 0.5 score
672        assert_eq!(
673            result.license_score, 0.5,
674            "Empty licenses should be neutral"
675        );
676        assert_eq!(
677            result.supplier_score, 0.5,
678            "Empty supplier should be neutral"
679        );
680        assert_eq!(result.group_score, 0.5, "Empty group should be neutral");
681    }
682
683    #[test]
684    fn test_multi_field_scoring_different_versions() {
685        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
686        let weights = config::MultiFieldWeights::balanced();
687
688        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
689        a.version = Some("4.17.21".to_string());
690        a.ecosystem = Some(crate::model::Ecosystem::Npm);
691
692        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
693        b.version = Some("4.17.20".to_string()); // Different patch version
694        b.ecosystem = Some(crate::model::Ecosystem::Npm);
695
696        let result = matcher.compute_multi_field_score(&a, &b, &weights);
697
698        // Name matches perfectly
699        assert!(result.name_score > 0.9, "Name score should be > 0.9");
700
701        // Graduated version scoring: same major.minor gives high score
702        // 4.17.21 vs 4.17.20 = same major.minor, patch diff of 1
703        // Expected: 0.8 - 0.01 * 1 = 0.79
704        assert!(
705            result.version_score > 0.7,
706            "Same major.minor with patch diff should score high, got {}",
707            result.version_score
708        );
709
710        // Ecosystem matches
711        assert_eq!(
712            result.ecosystem_score, 1.0,
713            "Same ecosystem should score 1.0"
714        );
715
716        // Total should be high due to name, ecosystem, and graduated version score
717        assert!(
718            result.total > 0.8,
719            "Total should be > 0.8, got {}",
720            result.total
721        );
722    }
723
724    #[test]
725    fn test_multi_field_scoring_different_major_versions() {
726        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
727        let weights = config::MultiFieldWeights::balanced();
728
729        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
730        a.version = Some("4.17.21".to_string());
731        a.ecosystem = Some(crate::model::Ecosystem::Npm);
732
733        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
734        b.version = Some("3.10.0".to_string()); // Different major version
735        b.ecosystem = Some(crate::model::Ecosystem::Npm);
736
737        let result = matcher.compute_multi_field_score(&a, &b, &weights);
738
739        // Graduated version scoring: different major gives low score
740        // 4 vs 3 = major diff of 1
741        // Expected: 0.3 - 0.10 * 1 = 0.20
742        assert!(
743            result.version_score < 0.3,
744            "Different major versions should score low, got {}",
745            result.version_score
746        );
747    }
748
749    #[test]
750    fn test_multi_field_scoring_legacy_weights() {
751        // Test that legacy weights disable graduated scoring
752        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
753        let weights = config::MultiFieldWeights::legacy();
754
755        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
756        a.version = Some("4.17.21".to_string());
757        a.ecosystem = Some(crate::model::Ecosystem::Npm);
758
759        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
760        b.version = Some("4.17.20".to_string());
761        b.ecosystem = Some(crate::model::Ecosystem::Npm);
762
763        let result = matcher.compute_multi_field_score(&a, &b, &weights);
764
765        // Legacy mode: binary version scoring (exact match or 0)
766        assert_eq!(
767            result.version_score, 0.0,
768            "Legacy mode: different versions should score 0"
769        );
770    }
771
772    #[test]
773    fn test_multi_field_config_preset() {
774        let config = FuzzyMatchConfig::from_preset("balanced-multi").unwrap();
775        assert!(config.field_weights.is_some());
776
777        let config = FuzzyMatchConfig::from_preset("strict_multi").unwrap();
778        assert!(config.field_weights.is_some());
779    }
780
781    #[test]
782    fn test_multi_field_score_result_summary() {
783        let result = MultiFieldScoreResult {
784            total: 0.85,
785            name_score: 1.0,
786            version_score: 0.0,
787            ecosystem_score: 1.0,
788            license_score: 0.5,
789            supplier_score: 0.5,
790            group_score: 0.5,
791        };
792
793        let summary = result.summary();
794        assert!(summary.contains("0.85"));
795        assert!(summary.contains("name: 1.00"));
796    }
797
798    #[test]
799    fn test_token_similarity_exact() {
800        let score = string_similarity::compute_token_similarity("react-dom", "react-dom");
801        assert_eq!(score, 1.0);
802    }
803
804    #[test]
805    fn test_token_similarity_reordered() {
806        // Reordered tokens should have high similarity
807        let score = string_similarity::compute_token_similarity("react-dom", "dom-react");
808        assert_eq!(score, 1.0, "Reordered tokens should match perfectly");
809    }
810
811    #[test]
812    fn test_token_similarity_partial() {
813        // Partial token overlap
814        let score = string_similarity::compute_token_similarity("react-dom-utils", "react-dom");
815        // Jaccard: 2 common / 3 total = 0.667
816        assert!(
817            (score - 0.667).abs() < 0.01,
818            "Partial overlap should be ~0.67, got {}",
819            score
820        );
821    }
822
823    #[test]
824    fn test_token_similarity_different_delimiters() {
825        // Different delimiters should still work
826        let score =
827            string_similarity::compute_token_similarity("my_package_name", "my-package-name");
828        assert_eq!(score, 1.0, "Different delimiters should match");
829    }
830
#[test]
fn test_token_similarity_no_overlap() {
    // Two unrelated single-word names must score exactly zero.
    let (first, second) = ("react", "angular");
    let similarity = string_similarity::compute_token_similarity(first, second);
    assert_eq!(similarity, 0.0, "No common tokens should score 0");
}
836
#[test]
fn test_version_similarity_exact() {
    // Identical version strings are expected to earn the 0.10 boost.
    let left = String::from("1.2.3");
    let right = String::from("1.2.3");
    let boost = FuzzyMatcher::compute_version_similarity(Some(&left), Some(&right));
    assert_eq!(boost, 0.10, "Exact version match should give max boost");
}
844
#[test]
fn test_version_similarity_same_major_minor() {
    // Versions differing only in the patch component are expected to earn 0.07.
    let left = String::from("1.2.3");
    let right = String::from("1.2.4");
    let boost = FuzzyMatcher::compute_version_similarity(Some(&left), Some(&right));
    assert_eq!(boost, 0.07, "Same major.minor should give 0.07 boost");
}
852
#[test]
fn test_version_similarity_same_major() {
    // Versions sharing only the major component are expected to earn 0.04.
    let left = String::from("1.2.3");
    let right = String::from("1.5.0");
    let boost = FuzzyMatcher::compute_version_similarity(Some(&left), Some(&right));
    assert_eq!(boost, 0.04, "Same major should give 0.04 boost");
}
860
#[test]
fn test_version_similarity_different_major() {
    // Versions with different major components are expected to earn nothing.
    let left = String::from("1.2.3");
    let right = String::from("2.0.0");
    let boost = FuzzyMatcher::compute_version_similarity(Some(&left), Some(&right));
    assert_eq!(boost, 0.0, "Different major versions should give no boost");
}
868
#[test]
fn test_version_similarity_prerelease() {
    // A prerelease suffix such as "-beta" must not prevent the major.minor
    // comparison: "1.2.3-beta" vs "1.2.4" is expected to earn 0.07.
    let prerelease = String::from("1.2.3-beta");
    let release = String::from("1.2.4");
    let boost = FuzzyMatcher::compute_version_similarity(Some(&prerelease), Some(&release));
    assert_eq!(boost, 0.07, "Prerelease should still match major.minor");
}
877
#[test]
fn test_version_similarity_missing() {
    // No boost is expected whenever at least one side has no version.
    let known = String::from("1.0.0");

    // Only the first version is absent.
    let one_missing = FuzzyMatcher::compute_version_similarity(None, Some(&known));
    assert_eq!(one_missing, 0.0, "Missing version should give no boost");

    // Both versions are absent.
    let both_missing = FuzzyMatcher::compute_version_similarity(None, None);
    assert_eq!(both_missing, 0.0, "Both missing should give no boost");
}
887
#[test]
fn test_fuzzy_match_with_reordered_tokens() {
    // With the permissive preset, two components whose names hold the same
    // tokens in swapped order must still score above 0.5.
    let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());

    let forward = Component::new(String::from("react-dom"), String::from("comp-1"));
    let swapped = Component::new(String::from("dom-react"), String::from("comp-2"));

    // Original author's note: token similarity is 1.0 here, blended with
    // character similarity, hence the loose lower bound.
    let match_score = matcher.match_components(&forward, &swapped);
    assert!(
        match_score > 0.5,
        "Reordered names should still match, got {}",
        match_score
    );
}
903
#[test]
fn test_fuzzy_match_version_boost() {
    let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());

    // Builds a component with the given name, id and version.
    let versioned = |name: &str, id: &str, version: &str| {
        let mut component = Component::new(name.to_string(), id.to_string());
        component.version = Some(version.to_string());
        component
    };

    // The names differ slightly so the comparison relies on fuzzy matching
    // rather than an exact match.
    let baseline = versioned("lodash-utils", "comp-1", "4.17.21");
    // Same major.minor as the baseline -> +0.07 boost.
    let near = versioned("lodash-util", "comp-2", "4.17.20");
    // Different major from the baseline -> +0.0 boost.
    let far = versioned("lodash-util", "comp-3", "5.0.0");

    let score_near = matcher.match_components(&baseline, &near);
    let score_far = matcher.match_components(&baseline, &far);

    // Both candidates should match (fuzzily), but the one sharing major.minor
    // must rank strictly higher thanks to the version boost.
    assert!(score_near > 0.0, "Same minor should match");
    assert!(score_far > 0.0, "Different major should still match");
    assert!(
        score_near > score_far,
        "Same minor version should score higher: {} vs {}",
        score_near,
        score_far
    );
}
931
#[test]
fn test_soundex_basic() {
    // Each name is paired with the Soundex code it must encode to. The pairs
    // Robert/Rupert and Smith/Smyth are expected to share a code.
    let cases = [
        ("Robert", "R163"),
        ("Rupert", "R163"),
        ("Smith", "S530"),
        ("Smyth", "S530"),
    ];
    for (name, code) in cases {
        assert_eq!(string_similarity::soundex(name), code);
    }
}
940
#[test]
fn test_soundex_empty() {
    // An empty string and a string with no letters must both encode to "".
    for input in ["", "123"] {
        assert_eq!(string_similarity::soundex(input), "");
    }
}
946
#[test]
fn test_phonetic_similarity_exact() {
    // The two spellings "color" and "colour" must score a perfect 1.0.
    let (american, british) = ("color", "colour");
    let similarity = string_similarity::compute_phonetic_similarity(american, british);
    assert_eq!(similarity, 1.0, "color and colour should match phonetically");
}
952
#[test]
fn test_phonetic_similarity_different() {
    // Two unrelated names must score below 0.5 phonetically.
    let score = string_similarity::compute_phonetic_similarity("react", "angular");
    // Report the computed score on failure, as the sibling threshold tests do,
    // so a regression can be diagnosed from the test output alone.
    assert!(
        score < 0.5,
        "Different names should have low phonetic similarity, got {}",
        score
    );
}
961
#[test]
fn test_phonetic_similarity_compound() {
    // Compound names whose first tokens are spelled differently
    // ("json" vs "jayson") are expected to score above 0.5.
    let (plain, respelled) = ("json-parser", "jayson-parser");
    let similarity = string_similarity::compute_phonetic_similarity(plain, respelled);
    assert!(
        similarity > 0.5,
        "Similar sounding compound names should match: {}",
        similarity
    );
}
972
#[test]
fn test_fuzzy_match_with_phonetic() {
    // With the permissive preset, component names that differ only in the
    // color/colour spelling must score above 0.7.
    let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());

    let american = Component::new(String::from("color-utils"), String::from("comp-1"));
    let british = Component::new(String::from("colour-utils"), String::from("comp-2"));

    let match_score = matcher.match_components(&american, &british);
    assert!(
        match_score > 0.7,
        "Phonetically similar names should match: {}",
        match_score
    );
}
987}
sbom_tools/matching/mod.rs

sbom_tools/matching/
mod.rs