sbom_tools/matching/
mod.rs

1//! Fuzzy matching engine for cross-ecosystem package correlation.
2//!
3//! This module provides multi-tier matching strategies for correlating
4//! components across different ecosystems and naming conventions.
5//!
6//! # Architecture
7//!
8//! The matching system is built on the [`ComponentMatcher`] trait, which
9//! provides a pluggable interface for different matching strategies:
10//!
11//! - [`FuzzyMatcher`]: Multi-tier fuzzy matching (default)
12//! - [`CompositeMatcher`]: Combines multiple matchers
13//! - [`CachedMatcher`]: Wraps any matcher with caching
14//!
15//! # Example
16//!
17//! ```ignore
18//! use sbom_tools::matching::{ComponentMatcher, FuzzyMatcher, FuzzyMatchConfig};
19//!
20//! // Use the trait for dependency injection
21//! fn diff_with_matcher(matcher: &dyn ComponentMatcher) {
22//!     let score = matcher.match_score(&comp_a, &comp_b);
23//! }
24//!
25//! let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
26//! diff_with_matcher(&matcher);
27//! ```
28
29pub mod adaptive;
30mod aliases;
31mod config;
32pub mod cross_ecosystem;
33pub mod custom_rules;
34pub mod ecosystem_config;
35pub mod index;
36pub mod lsh;
37mod purl;
38pub mod rule_engine;
39mod rules;
40pub mod scoring;
41pub mod string_similarity;
42mod traits;
43
44pub use adaptive::{
45    AdaptiveMatching, AdaptiveMethod, AdaptiveThreshold, AdaptiveThresholdConfig,
46    AdaptiveThresholdResult, ScoreStats,
47};
48pub use aliases::AliasTable;
49pub use config::{CrossEcosystemConfig, FuzzyMatchConfig, MultiFieldWeights};
50pub use cross_ecosystem::{CrossEcosystemDb, CrossEcosystemMatch, PackageFamily};
51pub use custom_rules::{
52    AliasPattern, EquivalenceGroup, ExclusionRule, MatchingRulesConfig, RulePrecedence,
53    RulesSummary,
54};
55pub use ecosystem_config::{
56    ConfigError, CustomEquivalence, CustomRules, EcosystemConfig, EcosystemRulesConfig,
57    GlobalSettings, GroupMigration, ImportMapping, NormalizationConfig, PackageGroup,
58    ScopeHandling, SecurityConfig, TyposquatEntry, VersionSpec, VersioningConfig,
59};
60pub use index::{
61    BatchCandidateConfig, BatchCandidateGenerator, BatchCandidateResult, BatchCandidateStats,
62    ComponentIndex, IndexStats, LazyComponentIndex, NormalizedEntry,
63};
64pub use lsh::{LshConfig, LshIndex, LshIndexStats, MinHashSignature};
65pub use purl::PurlNormalizer;
66pub use rule_engine::{AppliedRule, AppliedRuleType, RuleApplicationResult, RuleEngine};
67pub use rules::EcosystemRules;
68pub use traits::{
69    CacheConfig, CacheStats, CachedMatcher, ComponentMatcher, CompositeMatcher,
70    CompositeMatcherBuilder, MatchExplanation, MatchMetadata, MatchResult, MatchTier,
71    ScoreComponent,
72};
73pub use scoring::MultiFieldScoreResult;
74
75use crate::model::Component;
76use strsim::{jaro_winkler, levenshtein};
77
/// Fuzzy matcher for component correlation.
///
/// Bundles the matching configuration with the helper tables consulted by
/// [`FuzzyMatcher::match_components`]: an alias table, a PURL normalizer, and
/// ecosystem-specific name rules.
#[must_use]
pub struct FuzzyMatcher {
    /// Threshold and weights that drive scoring.
    config: FuzzyMatchConfig,
    /// Known package aliases consulted by the alias layer.
    alias_table: AliasTable,
    /// Normalizes PURLs before the exact-identifier comparison.
    purl_normalizer: PurlNormalizer,
    /// Per-ecosystem name normalization used by the rule layer.
    ecosystem_rules: EcosystemRules,
}
86
87impl FuzzyMatcher {
88    /// Create a new fuzzy matcher with the given configuration
89    pub fn new(config: FuzzyMatchConfig) -> Self {
90        Self {
91            config,
92            alias_table: AliasTable::default(),
93            purl_normalizer: PurlNormalizer::new(),
94            ecosystem_rules: EcosystemRules::new(),
95        }
96    }
97
    /// Get the current configuration.
    ///
    /// Returns a shared reference to the configuration held by this matcher.
    #[must_use]
    pub const fn config(&self) -> &FuzzyMatchConfig {
        &self.config
    }
103
104    /// Create a matcher with a custom alias table
105    pub fn with_alias_table(mut self, table: AliasTable) -> Self {
106        self.alias_table = table;
107        self
108    }
109
110    /// Match two components and return a confidence score (0.0 - 1.0)
111    #[must_use]
112    pub fn match_components(&self, a: &Component, b: &Component) -> f64 {
113        // Layer 1: Exact PURL match
114        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
115            let norm_a = self.purl_normalizer.normalize(purl_a);
116            let norm_b = self.purl_normalizer.normalize(purl_b);
117            if norm_a == norm_b {
118                return 1.0;
119            }
120        }
121
122        // Layer 2: Alias table lookup
123        if self.check_alias_match(a, b) {
124            return 0.95;
125        }
126
127        // Layer 3: Rule-based ecosystem normalization
128        if let Some(score) = self.check_ecosystem_rules(a, b) {
129            if score >= 0.90 {
130                return score;
131            }
132        }
133
134        // Layer 4: Multi-field weighted scoring (if configured) or fuzzy string similarity
135        if let Some(ref weights) = self.config.field_weights {
136            // Use multi-field scoring when configured
137            let result = self.compute_multi_field_score(a, b, weights);
138            if result.total >= self.config.threshold {
139                return result.total;
140            }
141        } else {
142            // Fall back to simple fuzzy string similarity
143            let fuzzy_score = self.compute_fuzzy_score(a, b);
144            if fuzzy_score >= self.config.threshold {
145                return fuzzy_score;
146            }
147        }
148
149        0.0
150    }
151
152    /// Check if components match via alias table
153    fn check_alias_match(&self, a: &Component, b: &Component) -> bool {
154        // Check if either component's name is an alias of the other
155        let names_a = self.get_all_names(a);
156        let names_b = self.get_all_names(b);
157
158        for name_a in &names_a {
159            if let Some(canonical) = self.alias_table.get_canonical(name_a) {
160                for name_b in &names_b {
161                    if self.alias_table.is_alias(&canonical, name_b) {
162                        return true;
163                    }
164                }
165            }
166        }
167
168        false
169    }
170
171    /// Get all possible names for a component
172    fn get_all_names(&self, comp: &Component) -> Vec<String> {
173        let mut names = vec![comp.name.clone()];
174        names.extend(comp.identifiers.aliases.clone());
175
176        // Extract name from PURL if available
177        if let Some(purl) = &comp.identifiers.purl {
178            if let Some(name) = self.extract_name_from_purl(purl) {
179                names.push(name);
180            }
181        }
182
183        names
184    }
185
186    /// Extract the package name from a PURL
187    fn extract_name_from_purl(&self, purl: &str) -> Option<String> {
188        // pkg:type/namespace/name@version?qualifiers#subpath
189        let without_pkg = purl.strip_prefix("pkg:")?;
190        let parts: Vec<&str> = without_pkg.split('/').collect();
191
192        if parts.len() >= 2 {
193            let name_part = parts.last()?;
194            // Remove version and qualifiers
195            let name = name_part.split('@').next()?;
196            Some(name.to_string())
197        } else {
198            None
199        }
200    }
201
202    /// Check ecosystem-specific matching rules
203    fn check_ecosystem_rules(&self, a: &Component, b: &Component) -> Option<f64> {
204        let ecosystem_a = a.ecosystem.as_ref()?;
205        let ecosystem_b = b.ecosystem.as_ref()?;
206
207        // Must be same ecosystem for rule-based matching
208        if ecosystem_a != ecosystem_b {
209            return None;
210        }
211
212        let norm_a = self.ecosystem_rules.normalize_name(&a.name, ecosystem_a);
213        let norm_b = self.ecosystem_rules.normalize_name(&b.name, ecosystem_b);
214
215        if norm_a == norm_b {
216            return Some(0.90);
217        }
218
219        None
220    }
221
222    /// Compute fuzzy string similarity score
223    fn compute_fuzzy_score(&self, a: &Component, b: &Component) -> f64 {
224        let name_a = a.name.to_lowercase();
225        let name_b = b.name.to_lowercase();
226
227        // Compute Jaro-Winkler similarity
228        let jw_score = jaro_winkler(&name_a, &name_b);
229
230        // Compute normalized Levenshtein distance
231        let max_len = name_a.len().max(name_b.len());
232        let lev_distance = levenshtein(&name_a, &name_b);
233        let lev_score = if max_len > 0 {
234            1.0 - (lev_distance as f64 / max_len as f64)
235        } else {
236            1.0
237        };
238
239        // Compute token-based similarity (catches reordered names like "react-dom" vs "dom-react")
240        let token_score = Self::compute_token_similarity(&name_a, &name_b);
241
242        // Compute phonetic similarity (catches typos like "color" vs "colour")
243        let phonetic_score = Self::compute_phonetic_similarity(&name_a, &name_b);
244
245        // Weighted combination of character-based scores
246        let char_score = jw_score.mul_add(self.config.jaro_winkler_weight, lev_score * self.config.levenshtein_weight);
247
248        // Use the MAXIMUM of character, token, and phonetic scores
249        // This allows each method to catch different types of variations
250        let combined = char_score.max(token_score).max(phonetic_score * 0.85);
251
252        // Version-aware boost (semantic version similarity)
253        let version_boost = Self::compute_version_similarity(a.version.as_ref(), b.version.as_ref());
254
255        (combined + version_boost).min(1.0)
256    }
257
    /// Compute token-based similarity using Jaccard index on name tokens.
    ///
    /// Thin wrapper that forwards both names unchanged to
    /// [`string_similarity::compute_token_similarity`] and returns its result.
    fn compute_token_similarity(name_a: &str, name_b: &str) -> f64 {
        string_similarity::compute_token_similarity(name_a, name_b)
    }
262
    /// Compute version similarity with semantic awareness.
    ///
    /// Thin wrapper that forwards both optional versions unchanged to
    /// [`string_similarity::compute_version_similarity`]. The result is used as
    /// an additive boost in the fuzzy score.
    fn compute_version_similarity(va: Option<&String>, vb: Option<&String>) -> f64 {
        string_similarity::compute_version_similarity(va, vb)
    }
267
    /// Compute phonetic similarity using Soundex.
    ///
    /// Thin public wrapper that forwards both names unchanged to
    /// [`string_similarity::compute_phonetic_similarity`] and returns its result.
    #[must_use]
    pub fn compute_phonetic_similarity(name_a: &str, name_b: &str) -> f64 {
        string_similarity::compute_phonetic_similarity(name_a, name_b)
    }
273
274    /// Compute multi-field weighted score.
275    ///
276    /// Combines scores from multiple component fields based on configured weights.
277    #[must_use] 
278    pub fn compute_multi_field_score(
279        &self,
280        a: &Component,
281        b: &Component,
282        weights: &config::MultiFieldWeights,
283    ) -> scoring::MultiFieldScoreResult {
284        use std::collections::HashSet;
285
286        let mut result = scoring::MultiFieldScoreResult::default();
287
288        // 1. Name similarity (using fuzzy scoring)
289        let name_score = self.compute_fuzzy_score(a, b);
290        result.name_score = name_score;
291        result.total += name_score * weights.name;
292
293        // 2. Version match (graduated or binary scoring)
294        let version_score = if weights.version_divergence_enabled {
295            scoring::compute_version_divergence_score(&a.version, &b.version, weights)
296        } else {
297            // Legacy binary scoring
298            match (&a.version, &b.version) {
299                (Some(va), Some(vb)) if va == vb => 1.0,
300                (None, None) => 0.5, // Both missing = neutral
301                _ => 0.0,
302            }
303        };
304        result.version_score = version_score;
305        result.total += version_score * weights.version;
306
307        // 3. Ecosystem match (exact match = 1.0, mismatch applies penalty)
308        let (ecosystem_score, ecosystem_penalty) = match (&a.ecosystem, &b.ecosystem) {
309            (Some(ea), Some(eb)) if ea == eb => (1.0, 0.0),
310            (None, None) => (0.5, 0.0), // Both missing = neutral, no penalty
311            (Some(_), Some(_)) => (0.0, weights.ecosystem_mismatch_penalty), // Different ecosystems = penalty
312            _ => (0.0, 0.0), // One missing = no match but no penalty
313        };
314        result.ecosystem_score = ecosystem_score;
315        result.total += ecosystem_score.mul_add(weights.ecosystem, ecosystem_penalty);
316
317        // 4. License overlap (Jaccard similarity on declared licenses)
318        let licenses_a: HashSet<_> = a
319            .licenses
320            .declared
321            .iter()
322            .map(|l| l.expression.as_str())
323            .collect();
324        let licenses_b: HashSet<_> = b
325            .licenses
326            .declared
327            .iter()
328            .map(|l| l.expression.as_str())
329            .collect();
330        let license_score = if licenses_a.is_empty() && licenses_b.is_empty() {
331            0.5 // Both empty = neutral
332        } else if licenses_a.is_empty() || licenses_b.is_empty() {
333            0.0 // One empty = no match
334        } else {
335            let intersection = licenses_a.intersection(&licenses_b).count();
336            let union = licenses_a.union(&licenses_b).count();
337            if union > 0 {
338                intersection as f64 / union as f64
339            } else {
340                0.0
341            }
342        };
343        result.license_score = license_score;
344        result.total += license_score * weights.licenses;
345
346        // 5. Supplier match (exact match on supplier organization name)
347        let supplier_score = match (&a.supplier, &b.supplier) {
348            (Some(sa), Some(sb)) if sa.name.to_lowercase() == sb.name.to_lowercase() => 1.0,
349            (None, None) => 0.5, // Both missing = neutral
350            _ => 0.0,
351        };
352        result.supplier_score = supplier_score;
353        result.total += supplier_score * weights.supplier;
354
355        // 6. Group/namespace match
356        let group_score = match (&a.group, &b.group) {
357            (Some(ga), Some(gb)) if ga.to_lowercase() == gb.to_lowercase() => 1.0,
358            (None, None) => 0.5, // Both missing = neutral
359            _ => 0.0,
360        };
361        result.group_score = group_score;
362        result.total += group_score * weights.group;
363
364        // Clamp total to [0.0, 1.0] after penalty application
365        result.total = result.total.clamp(0.0, 1.0);
366
367        result
368    }
369}
370
371impl Default for FuzzyMatcher {
372    fn default() -> Self {
373        Self::new(FuzzyMatchConfig::balanced())
374    }
375}
376
377impl ComponentMatcher for FuzzyMatcher {
    /// Score two components by delegating to [`FuzzyMatcher::match_components`].
    fn match_score(&self, a: &Component, b: &Component) -> f64 {
        self.match_components(a, b)
    }
381
382    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
383        // Layer 1: Exact PURL match
384        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
385            let norm_a = self.purl_normalizer.normalize(purl_a);
386            let norm_b = self.purl_normalizer.normalize(purl_b);
387            if norm_a == norm_b {
388                return MatchResult::with_metadata(
389                    1.0,
390                    MatchTier::ExactIdentifier,
391                    MatchMetadata {
392                        matched_fields: vec!["purl".to_string()],
393                        normalization: Some("purl_normalized".to_string()),
394                        rule_id: None,
395                    },
396                );
397            }
398        }
399
400        // Layer 2: Alias table lookup
401        if self.check_alias_match(a, b) {
402            return MatchResult::with_metadata(
403                0.95,
404                MatchTier::Alias,
405                MatchMetadata {
406                    matched_fields: vec!["name".to_string()],
407                    normalization: Some("alias_table".to_string()),
408                    rule_id: None,
409                },
410            );
411        }
412
413        // Layer 3: Rule-based ecosystem normalization
414        if let Some(score) = self.check_ecosystem_rules(a, b) {
415            if score >= 0.90 {
416                return MatchResult::with_metadata(
417                    score,
418                    MatchTier::EcosystemRule,
419                    MatchMetadata {
420                        matched_fields: vec!["name".to_string(), "ecosystem".to_string()],
421                        normalization: Some("ecosystem_rules".to_string()),
422                        rule_id: None,
423                    },
424                );
425            }
426        }
427
428        // Layer 4: Fuzzy string similarity
429        let fuzzy_score = self.compute_fuzzy_score(a, b);
430        if fuzzy_score >= self.config.threshold {
431            return MatchResult::with_metadata(
432                fuzzy_score,
433                MatchTier::Fuzzy,
434                MatchMetadata {
435                    matched_fields: vec!["name".to_string()],
436                    normalization: Some("fuzzy_similarity".to_string()),
437                    rule_id: None,
438                },
439            );
440        }
441
442        MatchResult::no_match()
443    }
444
    /// Static identifier for this matcher implementation.
    fn name(&self) -> &'static str {
        "FuzzyMatcher"
    }
448
    /// Minimum score, taken from the configuration, that the Layer 4 score must
    /// reach to count as a match.
    fn threshold(&self) -> f64 {
        self.config.threshold
    }
452
453    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
454        use strsim::{jaro_winkler, levenshtein};
455
456        // Layer 1: Exact PURL match
457        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
458            let norm_a = self.purl_normalizer.normalize(purl_a);
459            let norm_b = self.purl_normalizer.normalize(purl_b);
460            if norm_a == norm_b {
461                return MatchExplanation::matched(
462                    MatchTier::ExactIdentifier,
463                    1.0,
464                    format!(
465                        "Exact PURL match: '{purl_a}' equals '{purl_b}' after normalization"
466                    ),
467                )
468                .with_normalization("purl_normalized");
469            }
470        }
471
472        // Layer 2: Alias table lookup
473        if self.check_alias_match(a, b) {
474            return MatchExplanation::matched(
475                MatchTier::Alias,
476                0.95,
477                format!(
478                    "'{}' and '{}' are known aliases of the same package",
479                    a.name, b.name
480                ),
481            )
482            .with_normalization("alias_table");
483        }
484
485        // Layer 3: Rule-based ecosystem normalization
486        if let Some(score) = self.check_ecosystem_rules(a, b) {
487            if score >= 0.90 {
488                let ecosystem = a
489                    .ecosystem
490                    .as_ref().map_or_else(|| "unknown".to_string(), std::string::ToString::to_string);
491                return MatchExplanation::matched(
492                    MatchTier::EcosystemRule,
493                    score,
494                    format!(
495                        "Names match after {} ecosystem normalization: '{}' -> '{}'",
496                        ecosystem, a.name, b.name
497                    ),
498                )
499                .with_normalization(format!("{ecosystem}_normalization"));
500            }
501        }
502
503        // Layer 4: Fuzzy string similarity - compute detailed breakdown
504        let name_a = a.name.to_lowercase();
505        let name_b = b.name.to_lowercase();
506
507        let jw_score = jaro_winkler(&name_a, &name_b);
508        let max_len = name_a.len().max(name_b.len());
509        let lev_distance = levenshtein(&name_a, &name_b);
510        let lev_score = if max_len > 0 {
511            1.0 - (lev_distance as f64 / max_len as f64)
512        } else {
513            1.0
514        };
515
516        let jw_weighted = jw_score * self.config.jaro_winkler_weight;
517        let lev_weighted = lev_score * self.config.levenshtein_weight;
518
519        let version_boost = if a.version == b.version && a.version.is_some() {
520            0.05
521        } else {
522            0.0
523        };
524
525        let combined = (jw_weighted + lev_weighted + version_boost).min(1.0);
526
527        let mut explanation = if combined >= self.config.threshold {
528            MatchExplanation::matched(
529                MatchTier::Fuzzy,
530                combined,
531                format!(
532                    "Fuzzy match: '{}' ~ '{}' with {:.0}% similarity",
533                    a.name,
534                    b.name,
535                    combined * 100.0
536                ),
537            )
538        } else {
539            MatchExplanation::no_match(format!(
540                "Fuzzy similarity {:.2} below threshold {:.2}",
541                combined, self.config.threshold
542            ))
543        };
544
545        // Add score breakdown
546        explanation = explanation
547            .with_score_component(ScoreComponent {
548                name: "Jaro-Winkler".to_string(),
549                weight: self.config.jaro_winkler_weight,
550                raw_score: jw_score,
551                weighted_score: jw_weighted,
552                description: format!("'{name_a}' vs '{name_b}' = {jw_score:.2}"),
553            })
554            .with_score_component(ScoreComponent {
555                name: "Levenshtein".to_string(),
556                weight: self.config.levenshtein_weight,
557                raw_score: lev_score,
558                weighted_score: lev_weighted,
559                description: format!(
560                    "edit distance {lev_distance} / max_len {max_len} = {lev_score:.2}"
561                ),
562            });
563
564        if version_boost > 0.0 {
565            explanation = explanation.with_score_component(ScoreComponent {
566                name: "Version boost".to_string(),
567                weight: 1.0,
568                raw_score: version_boost,
569                weighted_score: version_boost,
570                description: format!("versions match: {:?}", a.version),
571            });
572        }
573
574        explanation.with_normalization("lowercase")
575    }
576}
577
578#[cfg(test)]
579mod tests {
580    use super::*;
581
582    #[test]
583    fn test_exact_purl_match() {
584        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
585
586        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
587        a.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
588
589        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
590        b.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
591
592        assert_eq!(matcher.match_components(&a, &b), 1.0);
593    }
594
595    #[test]
596    fn test_fuzzy_name_match() {
597        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
598
599        // Similar names should have some fuzzy match score
600        let a = Component::new("lodash-es".to_string(), "comp-1".to_string());
601        let b = Component::new("lodash".to_string(), "comp-2".to_string());
602
603        let score = matcher.match_components(&a, &b);
604        // With permissive threshold (0.70), similar names should match
605        assert!(
606            score >= 0.70,
607            "lodash-es vs lodash should have score >= 0.70, got {}",
608            score
609        );
610    }
611
612    #[test]
613    fn test_different_names_low_score() {
614        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::strict());
615
616        let a = Component::new("react".to_string(), "comp-1".to_string());
617        let b = Component::new("angular".to_string(), "comp-2".to_string());
618
619        let score = matcher.match_components(&a, &b);
620        assert!(
621            score < 0.5,
622            "react vs angular should have low score, got {}",
623            score
624        );
625    }
626
627    #[test]
628    fn test_multi_field_weights_normalized() {
629        let weights = config::MultiFieldWeights::balanced();
630        assert!(
631            weights.is_normalized(),
632            "Balanced weights should be normalized"
633        );
634
635        let weights = config::MultiFieldWeights::name_focused();
636        assert!(
637            weights.is_normalized(),
638            "Name-focused weights should be normalized"
639        );
640
641        let weights = config::MultiFieldWeights::security_focused();
642        assert!(
643            weights.is_normalized(),
644            "Security-focused weights should be normalized"
645        );
646    }
647
648    #[test]
649    fn test_multi_field_scoring_same_component() {
650        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
651        let weights = config::MultiFieldWeights::balanced();
652
653        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
654        a.version = Some("4.17.21".to_string());
655        a.ecosystem = Some(crate::model::Ecosystem::Npm);
656
657        // Identical component should score very high
658        // Note: empty licenses/supplier/group get neutral 0.5 score, so total won't be 1.0
659        let result = matcher.compute_multi_field_score(&a, &a, &weights);
660        assert!(
661            result.total > 0.90,
662            "Same component should score > 0.90, got {}",
663            result.total
664        );
665        assert_eq!(result.name_score, 1.0);
666        assert_eq!(result.version_score, 1.0);
667        assert_eq!(result.ecosystem_score, 1.0);
668        // Empty fields get neutral 0.5 score
669        assert_eq!(
670            result.license_score, 0.5,
671            "Empty licenses should be neutral"
672        );
673        assert_eq!(
674            result.supplier_score, 0.5,
675            "Empty supplier should be neutral"
676        );
677        assert_eq!(result.group_score, 0.5, "Empty group should be neutral");
678    }
679
680    #[test]
681    fn test_multi_field_scoring_different_versions() {
682        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
683        let weights = config::MultiFieldWeights::balanced();
684
685        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
686        a.version = Some("4.17.21".to_string());
687        a.ecosystem = Some(crate::model::Ecosystem::Npm);
688
689        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
690        b.version = Some("4.17.20".to_string()); // Different patch version
691        b.ecosystem = Some(crate::model::Ecosystem::Npm);
692
693        let result = matcher.compute_multi_field_score(&a, &b, &weights);
694
695        // Name matches perfectly
696        assert!(result.name_score > 0.9, "Name score should be > 0.9");
697
698        // Graduated version scoring: same major.minor gives high score
699        // 4.17.21 vs 4.17.20 = same major.minor, patch diff of 1
700        // Expected: 0.8 - 0.01 * 1 = 0.79
701        assert!(
702            result.version_score > 0.7,
703            "Same major.minor with patch diff should score high, got {}",
704            result.version_score
705        );
706
707        // Ecosystem matches
708        assert_eq!(
709            result.ecosystem_score, 1.0,
710            "Same ecosystem should score 1.0"
711        );
712
713        // Total should be high due to name, ecosystem, and graduated version score
714        assert!(
715            result.total > 0.8,
716            "Total should be > 0.8, got {}",
717            result.total
718        );
719    }
720
721    #[test]
722    fn test_multi_field_scoring_different_major_versions() {
723        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
724        let weights = config::MultiFieldWeights::balanced();
725
726        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
727        a.version = Some("4.17.21".to_string());
728        a.ecosystem = Some(crate::model::Ecosystem::Npm);
729
730        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
731        b.version = Some("3.10.0".to_string()); // Different major version
732        b.ecosystem = Some(crate::model::Ecosystem::Npm);
733
734        let result = matcher.compute_multi_field_score(&a, &b, &weights);
735
736        // Graduated version scoring: different major gives low score
737        // 4 vs 3 = major diff of 1
738        // Expected: 0.3 - 0.10 * 1 = 0.20
739        assert!(
740            result.version_score < 0.3,
741            "Different major versions should score low, got {}",
742            result.version_score
743        );
744    }
745
746    #[test]
747    fn test_multi_field_scoring_legacy_weights() {
748        // Test that legacy weights disable graduated scoring
749        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
750        let weights = config::MultiFieldWeights::legacy();
751
752        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
753        a.version = Some("4.17.21".to_string());
754        a.ecosystem = Some(crate::model::Ecosystem::Npm);
755
756        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
757        b.version = Some("4.17.20".to_string());
758        b.ecosystem = Some(crate::model::Ecosystem::Npm);
759
760        let result = matcher.compute_multi_field_score(&a, &b, &weights);
761
762        // Legacy mode: binary version scoring (exact match or 0)
763        assert_eq!(
764            result.version_score, 0.0,
765            "Legacy mode: different versions should score 0"
766        );
767    }
768
769    #[test]
770    fn test_multi_field_config_preset() {
771        let config = FuzzyMatchConfig::from_preset("balanced-multi").unwrap();
772        assert!(config.field_weights.is_some());
773
774        let config = FuzzyMatchConfig::from_preset("strict_multi").unwrap();
775        assert!(config.field_weights.is_some());
776    }
777
778    #[test]
779    fn test_multi_field_score_result_summary() {
780        let result = MultiFieldScoreResult {
781            total: 0.85,
782            name_score: 1.0,
783            version_score: 0.0,
784            ecosystem_score: 1.0,
785            license_score: 0.5,
786            supplier_score: 0.5,
787            group_score: 0.5,
788        };
789
790        let summary = result.summary();
791        assert!(summary.contains("0.85"));
792        assert!(summary.contains("name: 1.00"));
793    }
794
795    #[test]
796    fn test_token_similarity_exact() {
797        let score = string_similarity::compute_token_similarity("react-dom", "react-dom");
798        assert_eq!(score, 1.0);
799    }
800
801    #[test]
802    fn test_token_similarity_reordered() {
803        // Reordered tokens should have high similarity
804        let score = string_similarity::compute_token_similarity("react-dom", "dom-react");
805        assert_eq!(score, 1.0, "Reordered tokens should match perfectly");
806    }
807
808    #[test]
809    fn test_token_similarity_partial() {
810        // Partial token overlap
811        let score = string_similarity::compute_token_similarity("react-dom-utils", "react-dom");
812        // Jaccard: 2 common / 3 total = 0.667
813        assert!(
814            (score - 0.667).abs() < 0.01,
815            "Partial overlap should be ~0.67, got {}",
816            score
817        );
818    }
819
820    #[test]
821    fn test_token_similarity_different_delimiters() {
822        // Different delimiters should still work
823        let score = string_similarity::compute_token_similarity("my_package_name", "my-package-name");
824        assert_eq!(score, 1.0, "Different delimiters should match");
825    }
826
827    #[test]
828    fn test_token_similarity_no_overlap() {
829        let score = string_similarity::compute_token_similarity("react", "angular");
830        assert_eq!(score, 0.0, "No common tokens should score 0");
831    }
832
833    #[test]
834    fn test_version_similarity_exact() {
835        let v1 = "1.2.3".to_string();
836        let v2 = "1.2.3".to_string();
837        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
838        assert_eq!(score, 0.10, "Exact version match should give max boost");
839    }
840
841    #[test]
842    fn test_version_similarity_same_major_minor() {
843        let v1 = "1.2.3".to_string();
844        let v2 = "1.2.4".to_string();
845        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
846        assert_eq!(score, 0.07, "Same major.minor should give 0.07 boost");
847    }
848
849    #[test]
850    fn test_version_similarity_same_major() {
851        let v1 = "1.2.3".to_string();
852        let v2 = "1.5.0".to_string();
853        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
854        assert_eq!(score, 0.04, "Same major should give 0.04 boost");
855    }
856
857    #[test]
858    fn test_version_similarity_different_major() {
859        let v1 = "1.2.3".to_string();
860        let v2 = "2.0.0".to_string();
861        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
862        assert_eq!(score, 0.0, "Different major versions should give no boost");
863    }
864
865    #[test]
866    fn test_version_similarity_prerelease() {
867        // Handle prerelease versions like "1.2.3-beta"
868        let v1 = "1.2.3-beta".to_string();
869        let v2 = "1.2.4".to_string();
870        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
871        assert_eq!(score, 0.07, "Prerelease should still match major.minor");
872    }
873
874    #[test]
875    fn test_version_similarity_missing() {
876        let v = "1.0.0".to_string();
877        let score = FuzzyMatcher::compute_version_similarity(None, Some(&v));
878        assert_eq!(score, 0.0, "Missing version should give no boost");
879
880        let score = FuzzyMatcher::compute_version_similarity(None, None);
881        assert_eq!(score, 0.0, "Both missing should give no boost");
882    }
883
884    #[test]
885    fn test_fuzzy_match_with_reordered_tokens() {
886        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
887
888        let a = Component::new("react-dom".to_string(), "comp-1".to_string());
889        let b = Component::new("dom-react".to_string(), "comp-2".to_string());
890
891        let score = matcher.match_components(&a, &b);
892        // Token similarity is 1.0, blended with character similarity
893        assert!(
894            score > 0.5,
895            "Reordered names should still match, got {}",
896            score
897        );
898    }
899
900    #[test]
901    fn test_fuzzy_match_version_boost() {
902        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
903
904        // Use slightly different names so we rely on fuzzy matching, not exact match
905        let mut a = Component::new("lodash-utils".to_string(), "comp-1".to_string());
906        a.version = Some("4.17.21".to_string());
907
908        let mut b = Component::new("lodash-util".to_string(), "comp-2".to_string());
909        b.version = Some("4.17.20".to_string()); // Same major.minor -> +0.07 boost
910
911        let mut c = Component::new("lodash-util".to_string(), "comp-3".to_string());
912        c.version = Some("5.0.0".to_string()); // Different major -> +0.0 boost
913
914        let score_same_minor = matcher.match_components(&a, &b);
915        let score_diff_major = matcher.match_components(&a, &c);
916
917        // Both should match (fuzzy), but same_minor should have version boost
918        assert!(score_same_minor > 0.0, "Same minor should match");
919        assert!(score_diff_major > 0.0, "Different major should still match");
920        assert!(
921            score_same_minor > score_diff_major,
922            "Same minor version should score higher: {} vs {}",
923            score_same_minor,
924            score_diff_major
925        );
926    }
927
928    #[test]
929    fn test_soundex_basic() {
930        // Test basic Soundex encoding
931        assert_eq!(string_similarity::soundex("Robert"), "R163");
932        assert_eq!(string_similarity::soundex("Rupert"), "R163"); // Same as Robert
933        assert_eq!(string_similarity::soundex("Smith"), "S530");
934        assert_eq!(string_similarity::soundex("Smyth"), "S530"); // Same as Smith
935    }
936
937    #[test]
938    fn test_soundex_empty() {
939        assert_eq!(string_similarity::soundex(""), "");
940        assert_eq!(string_similarity::soundex("123"), ""); // No letters
941    }
942
943    #[test]
944    fn test_phonetic_similarity_exact() {
945        let score = string_similarity::compute_phonetic_similarity("color", "colour");
946        assert_eq!(score, 1.0, "color and colour should match phonetically");
947    }
948
949    #[test]
950    fn test_phonetic_similarity_different() {
951        let score = string_similarity::compute_phonetic_similarity("react", "angular");
952        assert!(
953            score < 0.5,
954            "Different names should have low phonetic similarity"
955        );
956    }
957
958    #[test]
959    fn test_phonetic_similarity_compound() {
960        // Test compound names where tokens match phonetically
961        let score = string_similarity::compute_phonetic_similarity("json-parser", "jayson-parser");
962        assert!(
963            score > 0.5,
964            "Similar sounding compound names should match: {}",
965            score
966        );
967    }
968
969    #[test]
970    fn test_fuzzy_match_with_phonetic() {
971        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
972
973        let a = Component::new("color-utils".to_string(), "comp-1".to_string());
974        let b = Component::new("colour-utils".to_string(), "comp-2".to_string());
975
976        let score = matcher.match_components(&a, &b);
977        assert!(
978            score > 0.7,
979            "Phonetically similar names should match: {}",
980            score
981        );
982    }
983}
sbom_tools/matching/mod.rs

sbom_tools/matching/
mod.rs