sbom_tools/matching/
mod.rs

1//! Fuzzy matching engine for cross-ecosystem package correlation.
2//!
3//! This module provides multi-tier matching strategies for correlating
4//! components across different ecosystems and naming conventions.
5//!
6//! # Architecture
7//!
8//! The matching system is built on the [`ComponentMatcher`] trait, which
9//! provides a pluggable interface for different matching strategies:
10//!
11//! - [`FuzzyMatcher`]: Multi-tier fuzzy matching (default)
12//! - [`CompositeMatcher`]: Combines multiple matchers
13//! - [`CachedMatcher`]: Wraps any matcher with caching
14//!
15//! # Example
16//!
17//! ```ignore
18//! use sbom_tools::matching::{ComponentMatcher, FuzzyMatcher, FuzzyMatchConfig};
19//!
20//! // Use the trait for dependency injection
21//! fn diff_with_matcher(matcher: &dyn ComponentMatcher) {
22//!     let score = matcher.match_score(&comp_a, &comp_b);
23//! }
24//!
25//! let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
26//! diff_with_matcher(&matcher);
27//! ```
28
29pub mod adaptive;
30mod aliases;
31mod config;
32pub mod cross_ecosystem;
33pub mod custom_rules;
34pub mod ecosystem_config;
35pub mod index;
36pub mod lsh;
37mod purl;
38pub mod rule_engine;
39mod rules;
40pub mod scoring;
41pub mod string_similarity;
42mod traits;
43
44pub use adaptive::{
45    AdaptiveMatching, AdaptiveMethod, AdaptiveThreshold, AdaptiveThresholdConfig,
46    AdaptiveThresholdResult, ScoreStats,
47};
48pub use aliases::AliasTable;
49pub use config::{CrossEcosystemConfig, FuzzyMatchConfig, MultiFieldWeights};
50pub use cross_ecosystem::{CrossEcosystemDb, CrossEcosystemMatch, PackageFamily};
51pub use custom_rules::{
52    AliasPattern, EquivalenceGroup, ExclusionRule, MatchingRulesConfig, RulePrecedence,
53    RulesSummary,
54};
55pub use ecosystem_config::{
56    ConfigError, CustomEquivalence, CustomRules, EcosystemConfig, EcosystemRulesConfig,
57    GlobalSettings, GroupMigration, ImportMapping, NormalizationConfig, PackageGroup,
58    ScopeHandling, SecurityConfig, TyposquatEntry, VersionSpec, VersioningConfig,
59};
60pub use index::{
61    BatchCandidateConfig, BatchCandidateGenerator, BatchCandidateResult, BatchCandidateStats,
62    ComponentIndex, IndexStats, LazyComponentIndex, NormalizedEntry,
63};
64pub use lsh::{LshConfig, LshIndex, LshIndexStats, MinHashSignature};
65pub use purl::PurlNormalizer;
66pub use rule_engine::{AppliedRule, AppliedRuleType, RuleApplicationResult, RuleEngine};
67pub use rules::EcosystemRules;
68pub use traits::{
69    CacheConfig, CacheStats, CachedMatcher, ComponentMatcher, CompositeMatcher,
70    CompositeMatcherBuilder, MatchExplanation, MatchMetadata, MatchResult, MatchTier,
71    ScoreComponent,
72};
73pub use scoring::MultiFieldScoreResult;
74
75use crate::model::Component;
76use strsim::{jaro_winkler, levenshtein};
77
78/// Fuzzy matcher for component correlation.
79#[must_use]
80pub struct FuzzyMatcher {
81    config: FuzzyMatchConfig,
82    alias_table: AliasTable,
83    purl_normalizer: PurlNormalizer,
84    ecosystem_rules: EcosystemRules,
85}
86
87impl FuzzyMatcher {
88    /// Create a new fuzzy matcher with the given configuration
89    pub fn new(config: FuzzyMatchConfig) -> Self {
90        Self {
91            config,
92            alias_table: AliasTable::default(),
93            purl_normalizer: PurlNormalizer::new(),
94            ecosystem_rules: EcosystemRules::new(),
95        }
96    }
97
98    /// Get the current configuration.
99    #[must_use] 
100    pub const fn config(&self) -> &FuzzyMatchConfig {
101        &self.config
102    }
103
104    /// Create a matcher with a custom alias table
105    pub fn with_alias_table(mut self, table: AliasTable) -> Self {
106        self.alias_table = table;
107        self
108    }
109
110    /// Match two components and return a confidence score (0.0 - 1.0)
111    #[must_use]
112    pub fn match_components(&self, a: &Component, b: &Component) -> f64 {
113        // Layer 1: Exact PURL match
114        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
115            let norm_a = self.purl_normalizer.normalize(purl_a);
116            let norm_b = self.purl_normalizer.normalize(purl_b);
117            if norm_a == norm_b {
118                return 1.0;
119            }
120        }
121
122        // Layer 2: Alias table lookup
123        if self.check_alias_match(a, b) {
124            return 0.95;
125        }
126
127        // Layer 3: Rule-based ecosystem normalization
128        if let Some(score) = self.check_ecosystem_rules(a, b)
129            && score >= 0.90 {
130                return score;
131            }
132
133        // Layer 4: Multi-field weighted scoring (if configured) or fuzzy string similarity
134        if let Some(ref weights) = self.config.field_weights {
135            // Use multi-field scoring when configured
136            let result = self.compute_multi_field_score(a, b, weights);
137            if result.total >= self.config.threshold {
138                return result.total;
139            }
140        } else {
141            // Fall back to simple fuzzy string similarity
142            let fuzzy_score = self.compute_fuzzy_score(a, b);
143            if fuzzy_score >= self.config.threshold {
144                return fuzzy_score;
145            }
146        }
147
148        0.0
149    }
150
151    /// Check if components match via alias table
152    fn check_alias_match(&self, a: &Component, b: &Component) -> bool {
153        // Check if either component's name is an alias of the other
154        let names_a = self.get_all_names(a);
155        let names_b = self.get_all_names(b);
156
157        for name_a in &names_a {
158            if let Some(canonical) = self.alias_table.get_canonical(name_a) {
159                for name_b in &names_b {
160                    if self.alias_table.is_alias(&canonical, name_b) {
161                        return true;
162                    }
163                }
164            }
165        }
166
167        false
168    }
169
170    /// Get all possible names for a component
171    fn get_all_names(&self, comp: &Component) -> Vec<String> {
172        let mut names = vec![comp.name.clone()];
173        names.extend(comp.identifiers.aliases.clone());
174
175        // Extract name from PURL if available
176        if let Some(purl) = &comp.identifiers.purl
177            && let Some(name) = self.extract_name_from_purl(purl) {
178                names.push(name);
179            }
180
181        names
182    }
183
184    /// Extract the package name from a PURL
185    fn extract_name_from_purl(&self, purl: &str) -> Option<String> {
186        // pkg:type/namespace/name@version?qualifiers#subpath
187        let without_pkg = purl.strip_prefix("pkg:")?;
188        let parts: Vec<&str> = without_pkg.split('/').collect();
189
190        if parts.len() >= 2 {
191            let name_part = parts.last()?;
192            // Remove version and qualifiers
193            let name = name_part.split('@').next()?;
194            Some(name.to_string())
195        } else {
196            None
197        }
198    }
199
200    /// Check ecosystem-specific matching rules
201    fn check_ecosystem_rules(&self, a: &Component, b: &Component) -> Option<f64> {
202        let ecosystem_a = a.ecosystem.as_ref()?;
203        let ecosystem_b = b.ecosystem.as_ref()?;
204
205        // Must be same ecosystem for rule-based matching
206        if ecosystem_a != ecosystem_b {
207            return None;
208        }
209
210        let norm_a = self.ecosystem_rules.normalize_name(&a.name, ecosystem_a);
211        let norm_b = self.ecosystem_rules.normalize_name(&b.name, ecosystem_b);
212
213        if norm_a == norm_b {
214            return Some(0.90);
215        }
216
217        None
218    }
219
220    /// Compute fuzzy string similarity score
221    fn compute_fuzzy_score(&self, a: &Component, b: &Component) -> f64 {
222        let name_a = a.name.to_lowercase();
223        let name_b = b.name.to_lowercase();
224
225        // Compute Jaro-Winkler similarity
226        let jw_score = jaro_winkler(&name_a, &name_b);
227
228        // Compute normalized Levenshtein distance
229        let max_len = name_a.len().max(name_b.len());
230        let lev_distance = levenshtein(&name_a, &name_b);
231        let lev_score = if max_len > 0 {
232            1.0 - (lev_distance as f64 / max_len as f64)
233        } else {
234            1.0
235        };
236
237        // Compute token-based similarity (catches reordered names like "react-dom" vs "dom-react")
238        let token_score = Self::compute_token_similarity(&name_a, &name_b);
239
240        // Compute phonetic similarity (catches typos like "color" vs "colour")
241        let phonetic_score = Self::compute_phonetic_similarity(&name_a, &name_b);
242
243        // Weighted combination of character-based scores
244        let char_score = jw_score.mul_add(self.config.jaro_winkler_weight, lev_score * self.config.levenshtein_weight);
245
246        // Use the MAXIMUM of character, token, and phonetic scores
247        // This allows each method to catch different types of variations
248        let combined = char_score.max(token_score).max(phonetic_score * 0.85);
249
250        // Version-aware boost (semantic version similarity)
251        let version_boost = Self::compute_version_similarity(a.version.as_ref(), b.version.as_ref());
252
253        (combined + version_boost).min(1.0)
254    }
255
256    /// Compute token-based similarity using Jaccard index on name tokens.
257    fn compute_token_similarity(name_a: &str, name_b: &str) -> f64 {
258        string_similarity::compute_token_similarity(name_a, name_b)
259    }
260
261    /// Compute version similarity with semantic awareness.
262    fn compute_version_similarity(va: Option<&String>, vb: Option<&String>) -> f64 {
263        string_similarity::compute_version_similarity(va, vb)
264    }
265
266    /// Compute phonetic similarity using Soundex.
267    #[must_use] 
268    pub fn compute_phonetic_similarity(name_a: &str, name_b: &str) -> f64 {
269        string_similarity::compute_phonetic_similarity(name_a, name_b)
270    }
271
272    /// Compute multi-field weighted score.
273    ///
274    /// Combines scores from multiple component fields based on configured weights.
275    #[must_use] 
276    pub fn compute_multi_field_score(
277        &self,
278        a: &Component,
279        b: &Component,
280        weights: &config::MultiFieldWeights,
281    ) -> scoring::MultiFieldScoreResult {
282        use std::collections::HashSet;
283
284        let mut result = scoring::MultiFieldScoreResult::default();
285
286        // 1. Name similarity (using fuzzy scoring)
287        let name_score = self.compute_fuzzy_score(a, b);
288        result.name_score = name_score;
289        result.total += name_score * weights.name;
290
291        // 2. Version match (graduated or binary scoring)
292        let version_score = if weights.version_divergence_enabled {
293            scoring::compute_version_divergence_score(&a.version, &b.version, weights)
294        } else {
295            // Legacy binary scoring
296            match (&a.version, &b.version) {
297                (Some(va), Some(vb)) if va == vb => 1.0,
298                (None, None) => 0.5, // Both missing = neutral
299                _ => 0.0,
300            }
301        };
302        result.version_score = version_score;
303        result.total += version_score * weights.version;
304
305        // 3. Ecosystem match (exact match = 1.0, mismatch applies penalty)
306        let (ecosystem_score, ecosystem_penalty) = match (&a.ecosystem, &b.ecosystem) {
307            (Some(ea), Some(eb)) if ea == eb => (1.0, 0.0),
308            (None, None) => (0.5, 0.0), // Both missing = neutral, no penalty
309            (Some(_), Some(_)) => (0.0, weights.ecosystem_mismatch_penalty), // Different ecosystems = penalty
310            _ => (0.0, 0.0), // One missing = no match but no penalty
311        };
312        result.ecosystem_score = ecosystem_score;
313        result.total += ecosystem_score.mul_add(weights.ecosystem, ecosystem_penalty);
314
315        // 4. License overlap (Jaccard similarity on declared licenses)
316        let licenses_a: HashSet<_> = a
317            .licenses
318            .declared
319            .iter()
320            .map(|l| l.expression.as_str())
321            .collect();
322        let licenses_b: HashSet<_> = b
323            .licenses
324            .declared
325            .iter()
326            .map(|l| l.expression.as_str())
327            .collect();
328        let license_score = if licenses_a.is_empty() && licenses_b.is_empty() {
329            0.5 // Both empty = neutral
330        } else if licenses_a.is_empty() || licenses_b.is_empty() {
331            0.0 // One empty = no match
332        } else {
333            let intersection = licenses_a.intersection(&licenses_b).count();
334            let union = licenses_a.union(&licenses_b).count();
335            if union > 0 {
336                intersection as f64 / union as f64
337            } else {
338                0.0
339            }
340        };
341        result.license_score = license_score;
342        result.total += license_score * weights.licenses;
343
344        // 5. Supplier match (exact match on supplier organization name)
345        let supplier_score = match (&a.supplier, &b.supplier) {
346            (Some(sa), Some(sb)) if sa.name.to_lowercase() == sb.name.to_lowercase() => 1.0,
347            (None, None) => 0.5, // Both missing = neutral
348            _ => 0.0,
349        };
350        result.supplier_score = supplier_score;
351        result.total += supplier_score * weights.supplier;
352
353        // 6. Group/namespace match
354        let group_score = match (&a.group, &b.group) {
355            (Some(ga), Some(gb)) if ga.to_lowercase() == gb.to_lowercase() => 1.0,
356            (None, None) => 0.5, // Both missing = neutral
357            _ => 0.0,
358        };
359        result.group_score = group_score;
360        result.total += group_score * weights.group;
361
362        // Clamp total to [0.0, 1.0] after penalty application
363        result.total = result.total.clamp(0.0, 1.0);
364
365        result
366    }
367}
368
369impl Default for FuzzyMatcher {
370    fn default() -> Self {
371        Self::new(FuzzyMatchConfig::balanced())
372    }
373}
374
375impl ComponentMatcher for FuzzyMatcher {
376    fn match_score(&self, a: &Component, b: &Component) -> f64 {
377        self.match_components(a, b)
378    }
379
380    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
381        // Layer 1: Exact PURL match
382        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
383            let norm_a = self.purl_normalizer.normalize(purl_a);
384            let norm_b = self.purl_normalizer.normalize(purl_b);
385            if norm_a == norm_b {
386                return MatchResult::with_metadata(
387                    1.0,
388                    MatchTier::ExactIdentifier,
389                    MatchMetadata {
390                        matched_fields: vec!["purl".to_string()],
391                        normalization: Some("purl_normalized".to_string()),
392                        rule_id: None,
393                    },
394                );
395            }
396        }
397
398        // Layer 2: Alias table lookup
399        if self.check_alias_match(a, b) {
400            return MatchResult::with_metadata(
401                0.95,
402                MatchTier::Alias,
403                MatchMetadata {
404                    matched_fields: vec!["name".to_string()],
405                    normalization: Some("alias_table".to_string()),
406                    rule_id: None,
407                },
408            );
409        }
410
411        // Layer 3: Rule-based ecosystem normalization
412        if let Some(score) = self.check_ecosystem_rules(a, b)
413            && score >= 0.90 {
414                return MatchResult::with_metadata(
415                    score,
416                    MatchTier::EcosystemRule,
417                    MatchMetadata {
418                        matched_fields: vec!["name".to_string(), "ecosystem".to_string()],
419                        normalization: Some("ecosystem_rules".to_string()),
420                        rule_id: None,
421                    },
422                );
423            }
424
425        // Layer 4: Fuzzy string similarity
426        let fuzzy_score = self.compute_fuzzy_score(a, b);
427        if fuzzy_score >= self.config.threshold {
428            return MatchResult::with_metadata(
429                fuzzy_score,
430                MatchTier::Fuzzy,
431                MatchMetadata {
432                    matched_fields: vec!["name".to_string()],
433                    normalization: Some("fuzzy_similarity".to_string()),
434                    rule_id: None,
435                },
436            );
437        }
438
439        MatchResult::no_match()
440    }
441
442    fn name(&self) -> &'static str {
443        "FuzzyMatcher"
444    }
445
446    fn threshold(&self) -> f64 {
447        self.config.threshold
448    }
449
450    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
451        use strsim::{jaro_winkler, levenshtein};
452
453        // Layer 1: Exact PURL match
454        if let (Some(purl_a), Some(purl_b)) = (&a.identifiers.purl, &b.identifiers.purl) {
455            let norm_a = self.purl_normalizer.normalize(purl_a);
456            let norm_b = self.purl_normalizer.normalize(purl_b);
457            if norm_a == norm_b {
458                return MatchExplanation::matched(
459                    MatchTier::ExactIdentifier,
460                    1.0,
461                    format!(
462                        "Exact PURL match: '{purl_a}' equals '{purl_b}' after normalization"
463                    ),
464                )
465                .with_normalization("purl_normalized");
466            }
467        }
468
469        // Layer 2: Alias table lookup
470        if self.check_alias_match(a, b) {
471            return MatchExplanation::matched(
472                MatchTier::Alias,
473                0.95,
474                format!(
475                    "'{}' and '{}' are known aliases of the same package",
476                    a.name, b.name
477                ),
478            )
479            .with_normalization("alias_table");
480        }
481
482        // Layer 3: Rule-based ecosystem normalization
483        if let Some(score) = self.check_ecosystem_rules(a, b)
484            && score >= 0.90 {
485                let ecosystem = a
486                    .ecosystem
487                    .as_ref().map_or_else(|| "unknown".to_string(), std::string::ToString::to_string);
488                return MatchExplanation::matched(
489                    MatchTier::EcosystemRule,
490                    score,
491                    format!(
492                        "Names match after {} ecosystem normalization: '{}' -> '{}'",
493                        ecosystem, a.name, b.name
494                    ),
495                )
496                .with_normalization(format!("{ecosystem}_normalization"));
497            }
498
499        // Layer 4: Fuzzy string similarity - compute detailed breakdown
500        let name_a = a.name.to_lowercase();
501        let name_b = b.name.to_lowercase();
502
503        let jw_score = jaro_winkler(&name_a, &name_b);
504        let max_len = name_a.len().max(name_b.len());
505        let lev_distance = levenshtein(&name_a, &name_b);
506        let lev_score = if max_len > 0 {
507            1.0 - (lev_distance as f64 / max_len as f64)
508        } else {
509            1.0
510        };
511
512        let jw_weighted = jw_score * self.config.jaro_winkler_weight;
513        let lev_weighted = lev_score * self.config.levenshtein_weight;
514
515        let version_boost = if a.version == b.version && a.version.is_some() {
516            0.05
517        } else {
518            0.0
519        };
520
521        let combined = (jw_weighted + lev_weighted + version_boost).min(1.0);
522
523        let mut explanation = if combined >= self.config.threshold {
524            MatchExplanation::matched(
525                MatchTier::Fuzzy,
526                combined,
527                format!(
528                    "Fuzzy match: '{}' ~ '{}' with {:.0}% similarity",
529                    a.name,
530                    b.name,
531                    combined * 100.0
532                ),
533            )
534        } else {
535            MatchExplanation::no_match(format!(
536                "Fuzzy similarity {:.2} below threshold {:.2}",
537                combined, self.config.threshold
538            ))
539        };
540
541        // Add score breakdown
542        explanation = explanation
543            .with_score_component(ScoreComponent {
544                name: "Jaro-Winkler".to_string(),
545                weight: self.config.jaro_winkler_weight,
546                raw_score: jw_score,
547                weighted_score: jw_weighted,
548                description: format!("'{name_a}' vs '{name_b}' = {jw_score:.2}"),
549            })
550            .with_score_component(ScoreComponent {
551                name: "Levenshtein".to_string(),
552                weight: self.config.levenshtein_weight,
553                raw_score: lev_score,
554                weighted_score: lev_weighted,
555                description: format!(
556                    "edit distance {lev_distance} / max_len {max_len} = {lev_score:.2}"
557                ),
558            });
559
560        if version_boost > 0.0 {
561            explanation = explanation.with_score_component(ScoreComponent {
562                name: "Version boost".to_string(),
563                weight: 1.0,
564                raw_score: version_boost,
565                weighted_score: version_boost,
566                description: format!("versions match: {:?}", a.version),
567            });
568        }
569
570        explanation.with_normalization("lowercase")
571    }
572}
573
574#[cfg(test)]
575mod tests {
576    use super::*;
577
578    #[test]
579    fn test_exact_purl_match() {
580        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
581
582        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
583        a.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
584
585        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
586        b.identifiers.purl = Some("pkg:npm/lodash@4.17.21".to_string());
587
588        assert_eq!(matcher.match_components(&a, &b), 1.0);
589    }
590
591    #[test]
592    fn test_fuzzy_name_match() {
593        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
594
595        // Similar names should have some fuzzy match score
596        let a = Component::new("lodash-es".to_string(), "comp-1".to_string());
597        let b = Component::new("lodash".to_string(), "comp-2".to_string());
598
599        let score = matcher.match_components(&a, &b);
600        // With permissive threshold (0.70), similar names should match
601        assert!(
602            score >= 0.70,
603            "lodash-es vs lodash should have score >= 0.70, got {}",
604            score
605        );
606    }
607
608    #[test]
609    fn test_different_names_low_score() {
610        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::strict());
611
612        let a = Component::new("react".to_string(), "comp-1".to_string());
613        let b = Component::new("angular".to_string(), "comp-2".to_string());
614
615        let score = matcher.match_components(&a, &b);
616        assert!(
617            score < 0.5,
618            "react vs angular should have low score, got {}",
619            score
620        );
621    }
622
623    #[test]
624    fn test_multi_field_weights_normalized() {
625        let weights = config::MultiFieldWeights::balanced();
626        assert!(
627            weights.is_normalized(),
628            "Balanced weights should be normalized"
629        );
630
631        let weights = config::MultiFieldWeights::name_focused();
632        assert!(
633            weights.is_normalized(),
634            "Name-focused weights should be normalized"
635        );
636
637        let weights = config::MultiFieldWeights::security_focused();
638        assert!(
639            weights.is_normalized(),
640            "Security-focused weights should be normalized"
641        );
642    }
643
644    #[test]
645    fn test_multi_field_scoring_same_component() {
646        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
647        let weights = config::MultiFieldWeights::balanced();
648
649        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
650        a.version = Some("4.17.21".to_string());
651        a.ecosystem = Some(crate::model::Ecosystem::Npm);
652
653        // Identical component should score very high
654        // Note: empty licenses/supplier/group get neutral 0.5 score, so total won't be 1.0
655        let result = matcher.compute_multi_field_score(&a, &a, &weights);
656        assert!(
657            result.total > 0.90,
658            "Same component should score > 0.90, got {}",
659            result.total
660        );
661        assert_eq!(result.name_score, 1.0);
662        assert_eq!(result.version_score, 1.0);
663        assert_eq!(result.ecosystem_score, 1.0);
664        // Empty fields get neutral 0.5 score
665        assert_eq!(
666            result.license_score, 0.5,
667            "Empty licenses should be neutral"
668        );
669        assert_eq!(
670            result.supplier_score, 0.5,
671            "Empty supplier should be neutral"
672        );
673        assert_eq!(result.group_score, 0.5, "Empty group should be neutral");
674    }
675
676    #[test]
677    fn test_multi_field_scoring_different_versions() {
678        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
679        let weights = config::MultiFieldWeights::balanced();
680
681        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
682        a.version = Some("4.17.21".to_string());
683        a.ecosystem = Some(crate::model::Ecosystem::Npm);
684
685        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
686        b.version = Some("4.17.20".to_string()); // Different patch version
687        b.ecosystem = Some(crate::model::Ecosystem::Npm);
688
689        let result = matcher.compute_multi_field_score(&a, &b, &weights);
690
691        // Name matches perfectly
692        assert!(result.name_score > 0.9, "Name score should be > 0.9");
693
694        // Graduated version scoring: same major.minor gives high score
695        // 4.17.21 vs 4.17.20 = same major.minor, patch diff of 1
696        // Expected: 0.8 - 0.01 * 1 = 0.79
697        assert!(
698            result.version_score > 0.7,
699            "Same major.minor with patch diff should score high, got {}",
700            result.version_score
701        );
702
703        // Ecosystem matches
704        assert_eq!(
705            result.ecosystem_score, 1.0,
706            "Same ecosystem should score 1.0"
707        );
708
709        // Total should be high due to name, ecosystem, and graduated version score
710        assert!(
711            result.total > 0.8,
712            "Total should be > 0.8, got {}",
713            result.total
714        );
715    }
716
717    #[test]
718    fn test_multi_field_scoring_different_major_versions() {
719        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
720        let weights = config::MultiFieldWeights::balanced();
721
722        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
723        a.version = Some("4.17.21".to_string());
724        a.ecosystem = Some(crate::model::Ecosystem::Npm);
725
726        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
727        b.version = Some("3.10.0".to_string()); // Different major version
728        b.ecosystem = Some(crate::model::Ecosystem::Npm);
729
730        let result = matcher.compute_multi_field_score(&a, &b, &weights);
731
732        // Graduated version scoring: different major gives low score
733        // 4 vs 3 = major diff of 1
734        // Expected: 0.3 - 0.10 * 1 = 0.20
735        assert!(
736            result.version_score < 0.3,
737            "Different major versions should score low, got {}",
738            result.version_score
739        );
740    }
741
742    #[test]
743    fn test_multi_field_scoring_legacy_weights() {
744        // Test that legacy weights disable graduated scoring
745        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced_multi_field());
746        let weights = config::MultiFieldWeights::legacy();
747
748        let mut a = Component::new("lodash".to_string(), "comp-1".to_string());
749        a.version = Some("4.17.21".to_string());
750        a.ecosystem = Some(crate::model::Ecosystem::Npm);
751
752        let mut b = Component::new("lodash".to_string(), "comp-2".to_string());
753        b.version = Some("4.17.20".to_string());
754        b.ecosystem = Some(crate::model::Ecosystem::Npm);
755
756        let result = matcher.compute_multi_field_score(&a, &b, &weights);
757
758        // Legacy mode: binary version scoring (exact match or 0)
759        assert_eq!(
760            result.version_score, 0.0,
761            "Legacy mode: different versions should score 0"
762        );
763    }
764
765    #[test]
766    fn test_multi_field_config_preset() {
767        let config = FuzzyMatchConfig::from_preset("balanced-multi").unwrap();
768        assert!(config.field_weights.is_some());
769
770        let config = FuzzyMatchConfig::from_preset("strict_multi").unwrap();
771        assert!(config.field_weights.is_some());
772    }
773
774    #[test]
775    fn test_multi_field_score_result_summary() {
776        let result = MultiFieldScoreResult {
777            total: 0.85,
778            name_score: 1.0,
779            version_score: 0.0,
780            ecosystem_score: 1.0,
781            license_score: 0.5,
782            supplier_score: 0.5,
783            group_score: 0.5,
784        };
785
786        let summary = result.summary();
787        assert!(summary.contains("0.85"));
788        assert!(summary.contains("name: 1.00"));
789    }
790
791    #[test]
792    fn test_token_similarity_exact() {
793        let score = string_similarity::compute_token_similarity("react-dom", "react-dom");
794        assert_eq!(score, 1.0);
795    }
796
797    #[test]
798    fn test_token_similarity_reordered() {
799        // Reordered tokens should have high similarity
800        let score = string_similarity::compute_token_similarity("react-dom", "dom-react");
801        assert_eq!(score, 1.0, "Reordered tokens should match perfectly");
802    }
803
804    #[test]
805    fn test_token_similarity_partial() {
806        // Partial token overlap
807        let score = string_similarity::compute_token_similarity("react-dom-utils", "react-dom");
808        // Jaccard: 2 common / 3 total = 0.667
809        assert!(
810            (score - 0.667).abs() < 0.01,
811            "Partial overlap should be ~0.67, got {}",
812            score
813        );
814    }
815
816    #[test]
817    fn test_token_similarity_different_delimiters() {
818        // Different delimiters should still work
819        let score = string_similarity::compute_token_similarity("my_package_name", "my-package-name");
820        assert_eq!(score, 1.0, "Different delimiters should match");
821    }
822
823    #[test]
824    fn test_token_similarity_no_overlap() {
825        let score = string_similarity::compute_token_similarity("react", "angular");
826        assert_eq!(score, 0.0, "No common tokens should score 0");
827    }
828
829    #[test]
830    fn test_version_similarity_exact() {
831        let v1 = "1.2.3".to_string();
832        let v2 = "1.2.3".to_string();
833        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
834        assert_eq!(score, 0.10, "Exact version match should give max boost");
835    }
836
837    #[test]
838    fn test_version_similarity_same_major_minor() {
839        let v1 = "1.2.3".to_string();
840        let v2 = "1.2.4".to_string();
841        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
842        assert_eq!(score, 0.07, "Same major.minor should give 0.07 boost");
843    }
844
845    #[test]
846    fn test_version_similarity_same_major() {
847        let v1 = "1.2.3".to_string();
848        let v2 = "1.5.0".to_string();
849        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
850        assert_eq!(score, 0.04, "Same major should give 0.04 boost");
851    }
852
853    #[test]
854    fn test_version_similarity_different_major() {
855        let v1 = "1.2.3".to_string();
856        let v2 = "2.0.0".to_string();
857        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
858        assert_eq!(score, 0.0, "Different major versions should give no boost");
859    }
860
861    #[test]
862    fn test_version_similarity_prerelease() {
863        // Handle prerelease versions like "1.2.3-beta"
864        let v1 = "1.2.3-beta".to_string();
865        let v2 = "1.2.4".to_string();
866        let score = FuzzyMatcher::compute_version_similarity(Some(&v1), Some(&v2));
867        assert_eq!(score, 0.07, "Prerelease should still match major.minor");
868    }
869
870    #[test]
871    fn test_version_similarity_missing() {
872        let v = "1.0.0".to_string();
873        let score = FuzzyMatcher::compute_version_similarity(None, Some(&v));
874        assert_eq!(score, 0.0, "Missing version should give no boost");
875
876        let score = FuzzyMatcher::compute_version_similarity(None, None);
877        assert_eq!(score, 0.0, "Both missing should give no boost");
878    }
879
880    #[test]
881    fn test_fuzzy_match_with_reordered_tokens() {
882        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
883
884        let a = Component::new("react-dom".to_string(), "comp-1".to_string());
885        let b = Component::new("dom-react".to_string(), "comp-2".to_string());
886
887        let score = matcher.match_components(&a, &b);
888        // Token similarity is 1.0, blended with character similarity
889        assert!(
890            score > 0.5,
891            "Reordered names should still match, got {}",
892            score
893        );
894    }
895
896    #[test]
897    fn test_fuzzy_match_version_boost() {
898        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
899
900        // Use slightly different names so we rely on fuzzy matching, not exact match
901        let mut a = Component::new("lodash-utils".to_string(), "comp-1".to_string());
902        a.version = Some("4.17.21".to_string());
903
904        let mut b = Component::new("lodash-util".to_string(), "comp-2".to_string());
905        b.version = Some("4.17.20".to_string()); // Same major.minor -> +0.07 boost
906
907        let mut c = Component::new("lodash-util".to_string(), "comp-3".to_string());
908        c.version = Some("5.0.0".to_string()); // Different major -> +0.0 boost
909
910        let score_same_minor = matcher.match_components(&a, &b);
911        let score_diff_major = matcher.match_components(&a, &c);
912
913        // Both should match (fuzzy), but same_minor should have version boost
914        assert!(score_same_minor > 0.0, "Same minor should match");
915        assert!(score_diff_major > 0.0, "Different major should still match");
916        assert!(
917            score_same_minor > score_diff_major,
918            "Same minor version should score higher: {} vs {}",
919            score_same_minor,
920            score_diff_major
921        );
922    }
923
924    #[test]
925    fn test_soundex_basic() {
926        // Test basic Soundex encoding
927        assert_eq!(string_similarity::soundex("Robert"), "R163");
928        assert_eq!(string_similarity::soundex("Rupert"), "R163"); // Same as Robert
929        assert_eq!(string_similarity::soundex("Smith"), "S530");
930        assert_eq!(string_similarity::soundex("Smyth"), "S530"); // Same as Smith
931    }
932
933    #[test]
934    fn test_soundex_empty() {
935        assert_eq!(string_similarity::soundex(""), "");
936        assert_eq!(string_similarity::soundex("123"), ""); // No letters
937    }
938
939    #[test]
940    fn test_phonetic_similarity_exact() {
941        let score = string_similarity::compute_phonetic_similarity("color", "colour");
942        assert_eq!(score, 1.0, "color and colour should match phonetically");
943    }
944
945    #[test]
946    fn test_phonetic_similarity_different() {
947        let score = string_similarity::compute_phonetic_similarity("react", "angular");
948        assert!(
949            score < 0.5,
950            "Different names should have low phonetic similarity"
951        );
952    }
953
954    #[test]
955    fn test_phonetic_similarity_compound() {
956        // Test compound names where tokens match phonetically
957        let score = string_similarity::compute_phonetic_similarity("json-parser", "jayson-parser");
958        assert!(
959            score > 0.5,
960            "Similar sounding compound names should match: {}",
961            score
962        );
963    }
964
965    #[test]
966    fn test_fuzzy_match_with_phonetic() {
967        let matcher = FuzzyMatcher::new(FuzzyMatchConfig::permissive());
968
969        let a = Component::new("color-utils".to_string(), "comp-1".to_string());
970        let b = Component::new("colour-utils".to_string(), "comp-2".to_string());
971
972        let score = matcher.match_components(&a, &b);
973        assert!(
974            score > 0.7,
975            "Phonetically similar names should match: {}",
976            score
977        );
978    }
979}
sbom_tools/matching/mod.rs

sbom_tools/matching/
mod.rs