Skip to main content

sbom_tools/matching/
traits.rs

1//! Trait definitions for component matching strategies.
2//!
3//! This module provides abstractions for component matching, enabling
4//! pluggable matching strategies and easier testing.
5
6use crate::model::Component;
7
8/// Result of matching two components.
9#[derive(Debug, Clone)]
10pub struct MatchResult {
11    /// The matching confidence score (0.0 - 1.0)
12    pub score: f64,
13    /// The matching tier that produced this result
14    pub tier: MatchTier,
15    /// Additional metadata about the match
16    pub metadata: MatchMetadata,
17}
18
19impl MatchResult {
20    /// Create a new match result
21    pub fn new(score: f64, tier: MatchTier) -> Self {
22        Self {
23            score,
24            tier,
25            metadata: MatchMetadata::default(),
26        }
27    }
28
29    /// Create a match result with metadata
30    pub fn with_metadata(score: f64, tier: MatchTier, metadata: MatchMetadata) -> Self {
31        Self {
32            score,
33            tier,
34            metadata,
35        }
36    }
37
38    /// Create a no-match result
39    pub fn no_match() -> Self {
40        Self {
41            score: 0.0,
42            tier: MatchTier::None,
43            metadata: MatchMetadata::default(),
44        }
45    }
46
47    /// Check if this represents a successful match
48    pub fn is_match(&self) -> bool {
49        self.score > 0.0 && self.tier != MatchTier::None
50    }
51}
52
53/// The tier/level at which a match was found.
54#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
55pub enum MatchTier {
56    /// No match found
57    None,
58    /// Exact identifier match (PURL, CPE, etc.)
59    ExactIdentifier,
60    /// Match via alias table
61    Alias,
62    /// Match via ecosystem-specific rules
63    EcosystemRule,
64    /// Match via fuzzy string similarity
65    Fuzzy,
66    /// Match via custom user rules
67    CustomRule,
68}
69
70impl MatchTier {
71    /// Get the default confidence score for this tier
72    pub fn default_score(&self) -> f64 {
73        match self {
74            MatchTier::None => 0.0,
75            MatchTier::ExactIdentifier => 1.0,
76            MatchTier::Alias => 0.95,
77            MatchTier::EcosystemRule => 0.90,
78            MatchTier::CustomRule => 0.92,
79            MatchTier::Fuzzy => 0.80,
80        }
81    }
82}
83
84/// Additional metadata about a match.
85#[derive(Debug, Clone, Default)]
86pub struct MatchMetadata {
87    /// The field(s) that matched
88    pub matched_fields: Vec<String>,
89    /// The normalization applied, if any
90    pub normalization: Option<String>,
91    /// The rule that produced the match, if applicable
92    pub rule_id: Option<String>,
93}
94
95/// Human-readable explanation of why two components matched (or didn't).
96///
97/// Useful for debugging match decisions and auditing SBOM diff results.
98#[derive(Debug, Clone)]
99pub struct MatchExplanation {
100    /// The matching tier that produced this result
101    pub tier: MatchTier,
102    /// The final confidence score
103    pub score: f64,
104    /// Human-readable reason for the match/non-match
105    pub reason: String,
106    /// Detailed breakdown of score components
107    pub score_breakdown: Vec<ScoreComponent>,
108    /// Normalizations that were applied
109    pub normalizations_applied: Vec<String>,
110    /// Whether this was a successful match
111    pub is_match: bool,
112}
113
114/// A component of the overall match score.
115#[derive(Debug, Clone)]
116pub struct ScoreComponent {
117    /// Name of this score component
118    pub name: String,
119    /// Weight applied to this component
120    pub weight: f64,
121    /// Raw score before weighting
122    pub raw_score: f64,
123    /// Weighted contribution to final score
124    pub weighted_score: f64,
125    /// Description of what was compared
126    pub description: String,
127}
128
129impl MatchExplanation {
130    /// Create an explanation for a successful match.
131    pub fn matched(tier: MatchTier, score: f64, reason: impl Into<String>) -> Self {
132        Self {
133            tier,
134            score,
135            reason: reason.into(),
136            score_breakdown: Vec::new(),
137            normalizations_applied: Vec::new(),
138            is_match: true,
139        }
140    }
141
142    /// Create an explanation for a failed match.
143    pub fn no_match(reason: impl Into<String>) -> Self {
144        Self {
145            tier: MatchTier::None,
146            score: 0.0,
147            reason: reason.into(),
148            score_breakdown: Vec::new(),
149            normalizations_applied: Vec::new(),
150            is_match: false,
151        }
152    }
153
154    /// Add a score component to the breakdown.
155    pub fn with_score_component(mut self, component: ScoreComponent) -> Self {
156        self.score_breakdown.push(component);
157        self
158    }
159
160    /// Add a normalization that was applied.
161    pub fn with_normalization(mut self, normalization: impl Into<String>) -> Self {
162        self.normalizations_applied.push(normalization.into());
163        self
164    }
165
166    /// Generate a human-readable summary of the match.
167    pub fn summary(&self) -> String {
168        if self.is_match {
169            format!(
170                "MATCH ({:.0}% confidence via {:?}): {}",
171                self.score * 100.0,
172                self.tier,
173                self.reason
174            )
175        } else {
176            format!("NO MATCH: {}", self.reason)
177        }
178    }
179
180    /// Generate a detailed multi-line explanation.
181    pub fn detailed(&self) -> String {
182        let mut lines = vec![self.summary()];
183
184        if !self.score_breakdown.is_empty() {
185            lines.push("Score breakdown:".to_string());
186            for component in &self.score_breakdown {
187                lines.push(format!(
188                    "  - {}: {:.2} × {:.2} = {:.2} ({})",
189                    component.name,
190                    component.raw_score,
191                    component.weight,
192                    component.weighted_score,
193                    component.description
194                ));
195            }
196        }
197
198        if !self.normalizations_applied.is_empty() {
199            lines.push(format!(
200                "Normalizations: {}",
201                self.normalizations_applied.join(", ")
202            ));
203        }
204
205        lines.join("\n")
206    }
207}
208
209impl std::fmt::Display for MatchExplanation {
210    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
211        write!(f, "{}", self.summary())
212    }
213}
214
215/// Trait for component matching strategies.
216///
217/// Implementors provide different strategies for determining if two
218/// components represent the same logical package across SBOMs.
219///
220/// # Example
221///
222/// ```ignore
223/// use sbom_tools::matching::{ComponentMatcher, FuzzyMatcher, FuzzyMatchConfig};
224///
225/// let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
226/// let score = matcher.match_score(&component_a, &component_b);
227/// ```
228pub trait ComponentMatcher: Send + Sync {
229    /// Compute a match score between two components.
230    ///
231    /// Returns a score between 0.0 (no match) and 1.0 (perfect match).
232    fn match_score(&self, a: &Component, b: &Component) -> f64;
233
234    /// Compute a detailed match result between two components.
235    ///
236    /// Returns a `MatchResult` with score, tier, and metadata.
237    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
238        let score = self.match_score(a, b);
239        if score > 0.0 {
240            MatchResult::new(score, MatchTier::Fuzzy)
241        } else {
242            MatchResult::no_match()
243        }
244    }
245
246    /// Generate a human-readable explanation of why two components matched or didn't.
247    ///
248    /// Useful for debugging and auditing match decisions.
249    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
250        let result = self.match_detailed(a, b);
251        if result.is_match() {
252            MatchExplanation::matched(
253                result.tier,
254                result.score,
255                format!("'{}' matches '{}' via {:?}", a.name, b.name, result.tier),
256            )
257        } else {
258            MatchExplanation::no_match(format!(
259                "'{}' does not match '{}' (score {:.2} below threshold)",
260                a.name, b.name, result.score
261            ))
262        }
263    }
264
265    /// Find the best matching component from a list of candidates.
266    ///
267    /// Returns the best match and its score, or None if no match meets the threshold.
268    fn find_best_match<'a>(
269        &self,
270        target: &Component,
271        candidates: &'a [&Component],
272        threshold: f64,
273    ) -> Option<(&'a Component, f64)> {
274        candidates
275            .iter()
276            .map(|c| (*c, self.match_score(target, c)))
277            .filter(|(_, score)| *score >= threshold)
278            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
279    }
280
281    /// Get the name of this matcher for logging/debugging.
282    fn name(&self) -> &str {
283        "ComponentMatcher"
284    }
285
286    /// Get the minimum threshold this matcher uses for fuzzy matching.
287    fn threshold(&self) -> f64 {
288        0.0
289    }
290}
291
292/// Configuration for the cached matcher.
293#[derive(Debug, Clone)]
294pub struct CacheConfig {
295    /// Maximum number of entries in the cache.
296    pub max_entries: usize,
297    /// Whether to cache detailed results (more memory).
298    pub cache_detailed: bool,
299}
300
301impl Default for CacheConfig {
302    fn default() -> Self {
303        Self {
304            max_entries: 100_000,
305            cache_detailed: false,
306        }
307    }
308}
309
310impl CacheConfig {
311    /// Create a config optimized for small SBOMs.
312    pub fn small() -> Self {
313        Self {
314            max_entries: 10_000,
315            cache_detailed: true,
316        }
317    }
318
319    /// Create a config optimized for large SBOMs.
320    pub fn large() -> Self {
321        Self {
322            max_entries: 500_000,
323            cache_detailed: false,
324        }
325    }
326}
327
328/// Cache key combining component IDs.
329#[derive(Hash, Eq, PartialEq, Clone)]
330struct CacheKey {
331    hash: u64,
332}
333
334impl CacheKey {
335    fn new(a_id: &str, b_id: &str) -> Self {
336        use xxhash_rust::xxh3::xxh3_64;
337
338        // Create a combined key - order-independent for symmetry
339        let (first, second) = if a_id < b_id {
340            (a_id, b_id)
341        } else {
342            (b_id, a_id)
343        };
344
345        let combined = format!("{}|{}", first, second);
346        Self {
347            hash: xxh3_64(combined.as_bytes()),
348        }
349    }
350}
351
352/// Cached match result entry.
353#[derive(Clone)]
354struct CacheEntry {
355    score: f64,
356    detailed: Option<MatchResult>,
357}
358
359/// A wrapper that caches match results for performance.
360///
361/// The cache uses component IDs to generate cache keys and stores
362/// match scores for quick lookup. This is particularly effective when
363/// the same component pairs are compared multiple times.
364///
365/// # Example
366///
367/// ```ignore
368/// use sbom_tools::matching::{CachedMatcher, FuzzyMatcher, FuzzyMatchConfig, CacheConfig};
369///
370/// let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
371/// let cached = CachedMatcher::new(matcher);
372/// // Or with custom config:
373/// let cached = CachedMatcher::with_config(matcher, CacheConfig::large());
374/// ```
375pub struct CachedMatcher<M: ComponentMatcher> {
376    inner: M,
377    config: CacheConfig,
378    cache: std::sync::RwLock<std::collections::HashMap<CacheKey, CacheEntry>>,
379    stats: std::sync::atomic::AtomicUsize,
380    hits: std::sync::atomic::AtomicUsize,
381}
382
383impl<M: ComponentMatcher> CachedMatcher<M> {
384    /// Create a new cached matcher wrapping the given matcher.
385    pub fn new(inner: M) -> Self {
386        Self::with_config(inner, CacheConfig::default())
387    }
388
389    /// Create a cached matcher with custom configuration.
390    pub fn with_config(inner: M, config: CacheConfig) -> Self {
391        Self {
392            inner,
393            config,
394            cache: std::sync::RwLock::new(std::collections::HashMap::new()),
395            stats: std::sync::atomic::AtomicUsize::new(0),
396            hits: std::sync::atomic::AtomicUsize::new(0),
397        }
398    }
399
400    /// Get a reference to the inner matcher.
401    pub fn inner(&self) -> &M {
402        &self.inner
403    }
404
405    /// Get cache statistics.
406    pub fn cache_stats(&self) -> CacheStats {
407        let total = self.stats.load(std::sync::atomic::Ordering::Relaxed);
408        let hits = self.hits.load(std::sync::atomic::Ordering::Relaxed);
409        let size = self.cache.read().map(|c| c.len()).unwrap_or(0);
410        CacheStats {
411            total_lookups: total,
412            cache_hits: hits,
413            cache_misses: total.saturating_sub(hits),
414            hit_rate: if total > 0 {
415                hits as f64 / total as f64
416            } else {
417                0.0
418            },
419            cache_size: size,
420        }
421    }
422
423    /// Clear the cache.
424    pub fn clear_cache(&self) {
425        if let Ok(mut cache) = self.cache.write() {
426            cache.clear();
427        }
428        self.stats.store(0, std::sync::atomic::Ordering::Relaxed);
429        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
430    }
431
432    /// Try to get a cached score.
433    fn get_cached(&self, key: &CacheKey) -> Option<CacheEntry> {
434        self.stats
435            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
436        if let Ok(cache) = self.cache.read() {
437            if let Some(entry) = cache.get(key) {
438                self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
439                return Some(entry.clone());
440            }
441        }
442        None
443    }
444
445    /// Store a result in the cache.
446    fn store_cached(&self, key: CacheKey, entry: CacheEntry) {
447        if let Ok(mut cache) = self.cache.write() {
448            // Simple eviction: clear half the cache when full
449            if cache.len() >= self.config.max_entries {
450                let to_remove: Vec<CacheKey> = cache
451                    .keys()
452                    .take(self.config.max_entries / 2)
453                    .cloned()
454                    .collect();
455                for k in to_remove {
456                    cache.remove(&k);
457                }
458            }
459            cache.insert(key, entry);
460        }
461    }
462}
463
464/// Cache statistics.
465#[derive(Debug, Clone)]
466pub struct CacheStats {
467    /// Total number of cache lookups.
468    pub total_lookups: usize,
469    /// Number of cache hits.
470    pub cache_hits: usize,
471    /// Number of cache misses.
472    pub cache_misses: usize,
473    /// Hit rate (0.0 - 1.0).
474    pub hit_rate: f64,
475    /// Current cache size.
476    pub cache_size: usize,
477}
478
479impl<M: ComponentMatcher> ComponentMatcher for CachedMatcher<M> {
480    fn match_score(&self, a: &Component, b: &Component) -> f64 {
481        let key = CacheKey::new(a.canonical_id.value(), b.canonical_id.value());
482
483        // Check cache first
484        if let Some(entry) = self.get_cached(&key) {
485            return entry.score;
486        }
487
488        // Compute and cache
489        let score = self.inner.match_score(a, b);
490        self.store_cached(
491            key,
492            CacheEntry {
493                score,
494                detailed: None,
495            },
496        );
497        score
498    }
499
500    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
501        if !self.config.cache_detailed {
502            return self.inner.match_detailed(a, b);
503        }
504
505        let key = CacheKey::new(a.canonical_id.value(), b.canonical_id.value());
506
507        // Check cache for detailed result
508        if let Some(entry) = self.get_cached(&key) {
509            if let Some(detailed) = entry.detailed {
510                return detailed;
511            }
512        }
513
514        // Compute and cache
515        let result = self.inner.match_detailed(a, b);
516        self.store_cached(
517            key,
518            CacheEntry {
519                score: result.score,
520                detailed: Some(result.clone()),
521            },
522        );
523        result
524    }
525
526    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
527        // Don't cache explanations as they're typically for debugging
528        self.inner.explain_match(a, b)
529    }
530
531    fn name(&self) -> &str {
532        "CachedMatcher"
533    }
534
535    fn threshold(&self) -> f64 {
536        self.inner.threshold()
537    }
538}
539
540/// A composite matcher that tries multiple strategies in order.
541pub struct CompositeMatcherBuilder {
542    matchers: Vec<Box<dyn ComponentMatcher>>,
543}
544
545impl CompositeMatcherBuilder {
546    /// Create a new composite matcher builder.
547    pub fn new() -> Self {
548        Self {
549            matchers: Vec::new(),
550        }
551    }
552
553    /// Add a matcher to the composite.
554    pub fn with_matcher(mut self, matcher: Box<dyn ComponentMatcher>) -> Self {
555        self.matchers.push(matcher);
556        self
557    }
558
559    /// Build the composite matcher.
560    pub fn build(self) -> CompositeMatcher {
561        CompositeMatcher {
562            matchers: self.matchers,
563        }
564    }
565}
566
567impl Default for CompositeMatcherBuilder {
568    fn default() -> Self {
569        Self::new()
570    }
571}
572
573/// A matcher that combines multiple matching strategies.
574pub struct CompositeMatcher {
575    matchers: Vec<Box<dyn ComponentMatcher>>,
576}
577
578impl ComponentMatcher for CompositeMatcher {
579    fn match_score(&self, a: &Component, b: &Component) -> f64 {
580        // Return the highest score from any matcher
581        self.matchers
582            .iter()
583            .map(|m| m.match_score(a, b))
584            .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
585            .unwrap_or(0.0)
586    }
587
588    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
589        // Return the best result from any matcher
590        self.matchers
591            .iter()
592            .map(|m| m.match_detailed(a, b))
593            .max_by(|a, b| {
594                a.score
595                    .partial_cmp(&b.score)
596                    .unwrap_or(std::cmp::Ordering::Equal)
597            })
598            .unwrap_or_else(MatchResult::no_match)
599    }
600
601    fn name(&self) -> &str {
602        "CompositeMatcher"
603    }
604}
605
606#[cfg(test)]
607mod tests {
608    use super::*;
609
610    /// A simple test matcher that always returns a fixed score
611    struct FixedScoreMatcher(f64);
612
613    impl ComponentMatcher for FixedScoreMatcher {
614        fn match_score(&self, _a: &Component, _b: &Component) -> f64 {
615            self.0
616        }
617
618        fn name(&self) -> &str {
619            "FixedScoreMatcher"
620        }
621    }
622
623    #[test]
624    fn test_match_result_creation() {
625        let result = MatchResult::new(0.95, MatchTier::Alias);
626        assert_eq!(result.score, 0.95);
627        assert_eq!(result.tier, MatchTier::Alias);
628        assert!(result.is_match());
629    }
630
631    #[test]
632    fn test_no_match_result() {
633        let result = MatchResult::no_match();
634        assert_eq!(result.score, 0.0);
635        assert_eq!(result.tier, MatchTier::None);
636        assert!(!result.is_match());
637    }
638
639    #[test]
640    fn test_match_tier_default_scores() {
641        assert_eq!(MatchTier::ExactIdentifier.default_score(), 1.0);
642        assert_eq!(MatchTier::Alias.default_score(), 0.95);
643        assert_eq!(MatchTier::EcosystemRule.default_score(), 0.90);
644        assert_eq!(MatchTier::None.default_score(), 0.0);
645    }
646
647    #[test]
648    fn test_composite_matcher() {
649        let matcher = CompositeMatcherBuilder::new()
650            .with_matcher(Box::new(FixedScoreMatcher(0.5)))
651            .with_matcher(Box::new(FixedScoreMatcher(0.8)))
652            .with_matcher(Box::new(FixedScoreMatcher(0.3)))
653            .build();
654
655        let comp_a = Component::new("test".to_string(), "id-1".to_string());
656        let comp_b = Component::new("test".to_string(), "id-2".to_string());
657
658        // Should return the highest score (0.8)
659        assert_eq!(matcher.match_score(&comp_a, &comp_b), 0.8);
660    }
661
662    #[test]
663    fn test_find_best_match() {
664        let matcher = FixedScoreMatcher(0.85);
665        let target = Component::new("target".to_string(), "id-0".to_string());
666        let candidates: Vec<Component> = vec![
667            Component::new("candidate1".to_string(), "id-1".to_string()),
668            Component::new("candidate2".to_string(), "id-2".to_string()),
669        ];
670        let candidate_refs: Vec<&Component> = candidates.iter().collect();
671
672        // With threshold 0.8, should find a match
673        let result = matcher.find_best_match(&target, &candidate_refs, 0.8);
674        assert!(result.is_some());
675
676        // With threshold 0.9, should not find a match
677        let result = matcher.find_best_match(&target, &candidate_refs, 0.9);
678        assert!(result.is_none());
679    }
680
681    #[test]
682    fn test_match_explanation_matched() {
683        let explanation =
684            MatchExplanation::matched(MatchTier::ExactIdentifier, 1.0, "Test match reason");
685
686        assert!(explanation.is_match);
687        assert_eq!(explanation.score, 1.0);
688        assert_eq!(explanation.tier, MatchTier::ExactIdentifier);
689        assert!(explanation.summary().contains("MATCH"));
690        assert!(explanation.summary().contains("100%"));
691    }
692
693    #[test]
694    fn test_match_explanation_no_match() {
695        let explanation = MatchExplanation::no_match("Components are too different");
696
697        assert!(!explanation.is_match);
698        assert_eq!(explanation.score, 0.0);
699        assert_eq!(explanation.tier, MatchTier::None);
700        assert!(explanation.summary().contains("NO MATCH"));
701    }
702
703    #[test]
704    fn test_match_explanation_with_breakdown() {
705        let explanation = MatchExplanation::matched(MatchTier::Fuzzy, 0.85, "Fuzzy match")
706            .with_score_component(ScoreComponent {
707                name: "Jaro-Winkler".to_string(),
708                weight: 0.7,
709                raw_score: 0.9,
710                weighted_score: 0.63,
711                description: "name similarity".to_string(),
712            })
713            .with_score_component(ScoreComponent {
714                name: "Levenshtein".to_string(),
715                weight: 0.3,
716                raw_score: 0.73,
717                weighted_score: 0.22,
718                description: "edit distance".to_string(),
719            })
720            .with_normalization("lowercase");
721
722        assert_eq!(explanation.score_breakdown.len(), 2);
723        assert_eq!(explanation.normalizations_applied.len(), 1);
724
725        let detailed = explanation.detailed();
726        assert!(detailed.contains("Score breakdown:"));
727        assert!(detailed.contains("Jaro-Winkler"));
728        assert!(detailed.contains("Normalizations: lowercase"));
729    }
730
731    #[test]
732    fn test_match_explanation_display() {
733        let explanation = MatchExplanation::matched(MatchTier::Alias, 0.95, "Known alias");
734        let display = format!("{}", explanation);
735        assert!(display.contains("MATCH"));
736        assert!(display.contains("95%"));
737    }
738}