Skip to main content

sbom_tools/matching/
traits.rs

1//! Trait definitions for component matching strategies.
2//!
3//! This module provides abstractions for component matching, enabling
4//! pluggable matching strategies and easier testing.
5
6use crate::model::Component;
7
8/// Result of matching two components.
9#[derive(Debug, Clone)]
10#[must_use]
11pub struct MatchResult {
12    /// The matching confidence score (0.0 - 1.0)
13    pub score: f64,
14    /// The matching tier that produced this result
15    pub tier: MatchTier,
16    /// Additional metadata about the match
17    pub metadata: MatchMetadata,
18}
19
20impl MatchResult {
21    /// Create a new match result
22    pub fn new(score: f64, tier: MatchTier) -> Self {
23        Self {
24            score,
25            tier,
26            metadata: MatchMetadata::default(),
27        }
28    }
29
30    /// Create a match result with metadata
31    pub const fn with_metadata(score: f64, tier: MatchTier, metadata: MatchMetadata) -> Self {
32        Self {
33            score,
34            tier,
35            metadata,
36        }
37    }
38
39    /// Create a no-match result
40    pub fn no_match() -> Self {
41        Self {
42            score: 0.0,
43            tier: MatchTier::None,
44            metadata: MatchMetadata::default(),
45        }
46    }
47
48    /// Check if this represents a successful match
49    #[must_use] 
50    pub fn is_match(&self) -> bool {
51        self.score > 0.0 && self.tier != MatchTier::None
52    }
53}
54
55/// The tier/level at which a match was found.
56#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
57#[non_exhaustive]
58pub enum MatchTier {
59    /// No match found
60    None,
61    /// Exact identifier match (PURL, CPE, etc.)
62    ExactIdentifier,
63    /// Match via alias table
64    Alias,
65    /// Match via ecosystem-specific rules
66    EcosystemRule,
67    /// Match via fuzzy string similarity
68    Fuzzy,
69    /// Match via custom user rules
70    CustomRule,
71}
72
73impl MatchTier {
74    /// Get the default confidence score for this tier
75    #[must_use] 
76    pub const fn default_score(&self) -> f64 {
77        match self {
78            Self::None => 0.0,
79            Self::ExactIdentifier => 1.0,
80            Self::Alias => 0.95,
81            Self::EcosystemRule => 0.90,
82            Self::CustomRule => 0.92,
83            Self::Fuzzy => 0.80,
84        }
85    }
86}
87
88/// Additional metadata about a match.
89#[derive(Debug, Clone, Default)]
90pub struct MatchMetadata {
91    /// The field(s) that matched
92    pub matched_fields: Vec<String>,
93    /// The normalization applied, if any
94    pub normalization: Option<String>,
95    /// The rule that produced the match, if applicable
96    pub rule_id: Option<String>,
97}
98
99/// Human-readable explanation of why two components matched (or didn't).
100///
101/// Useful for debugging match decisions and auditing SBOM diff results.
102#[derive(Debug, Clone)]
103pub struct MatchExplanation {
104    /// The matching tier that produced this result
105    pub tier: MatchTier,
106    /// The final confidence score
107    pub score: f64,
108    /// Human-readable reason for the match/non-match
109    pub reason: String,
110    /// Detailed breakdown of score components
111    pub score_breakdown: Vec<ScoreComponent>,
112    /// Normalizations that were applied
113    pub normalizations_applied: Vec<String>,
114    /// Whether this was a successful match
115    pub is_match: bool,
116}
117
118/// A component of the overall match score.
119#[derive(Debug, Clone)]
120pub struct ScoreComponent {
121    /// Name of this score component
122    pub name: String,
123    /// Weight applied to this component
124    pub weight: f64,
125    /// Raw score before weighting
126    pub raw_score: f64,
127    /// Weighted contribution to final score
128    pub weighted_score: f64,
129    /// Description of what was compared
130    pub description: String,
131}
132
133impl MatchExplanation {
134    /// Create an explanation for a successful match.
135    pub fn matched(tier: MatchTier, score: f64, reason: impl Into<String>) -> Self {
136        Self {
137            tier,
138            score,
139            reason: reason.into(),
140            score_breakdown: Vec::new(),
141            normalizations_applied: Vec::new(),
142            is_match: true,
143        }
144    }
145
146    /// Create an explanation for a failed match.
147    pub fn no_match(reason: impl Into<String>) -> Self {
148        Self {
149            tier: MatchTier::None,
150            score: 0.0,
151            reason: reason.into(),
152            score_breakdown: Vec::new(),
153            normalizations_applied: Vec::new(),
154            is_match: false,
155        }
156    }
157
158    /// Add a score component to the breakdown.
159    #[must_use]
160    pub fn with_score_component(mut self, component: ScoreComponent) -> Self {
161        self.score_breakdown.push(component);
162        self
163    }
164
165    /// Add a normalization that was applied.
166    #[must_use]
167    pub fn with_normalization(mut self, normalization: impl Into<String>) -> Self {
168        self.normalizations_applied.push(normalization.into());
169        self
170    }
171
172    /// Generate a human-readable summary of the match.
173    #[must_use] 
174    pub fn summary(&self) -> String {
175        if self.is_match {
176            format!(
177                "MATCH ({:.0}% confidence via {:?}): {}",
178                self.score * 100.0,
179                self.tier,
180                self.reason
181            )
182        } else {
183            format!("NO MATCH: {}", self.reason)
184        }
185    }
186
187    /// Generate a detailed multi-line explanation.
188    #[must_use] 
189    pub fn detailed(&self) -> String {
190        let mut lines = vec![self.summary()];
191
192        if !self.score_breakdown.is_empty() {
193            lines.push("Score breakdown:".to_string());
194            for component in &self.score_breakdown {
195                lines.push(format!(
196                    "  - {}: {:.2} × {:.2} = {:.2} ({})",
197                    component.name,
198                    component.raw_score,
199                    component.weight,
200                    component.weighted_score,
201                    component.description
202                ));
203            }
204        }
205
206        if !self.normalizations_applied.is_empty() {
207            lines.push(format!(
208                "Normalizations: {}",
209                self.normalizations_applied.join(", ")
210            ));
211        }
212
213        lines.join("\n")
214    }
215}
216
217impl std::fmt::Display for MatchExplanation {
218    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
219        write!(f, "{}", self.summary())
220    }
221}
222
223/// Trait for component matching strategies.
224///
225/// Implementors provide different strategies for determining if two
226/// components represent the same logical package across SBOMs.
227///
228/// # Example
229///
230/// ```ignore
231/// use sbom_tools::matching::{ComponentMatcher, FuzzyMatcher, FuzzyMatchConfig};
232///
233/// let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
234/// let score = matcher.match_score(&component_a, &component_b);
235/// ```
236pub trait ComponentMatcher: Send + Sync {
237    /// Compute a match score between two components.
238    ///
239    /// Returns a score between 0.0 (no match) and 1.0 (perfect match).
240    fn match_score(&self, a: &Component, b: &Component) -> f64;
241
242    /// Compute a detailed match result between two components.
243    ///
244    /// Returns a `MatchResult` with score, tier, and metadata.
245    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
246        let score = self.match_score(a, b);
247        if score > 0.0 {
248            MatchResult::new(score, MatchTier::Fuzzy)
249        } else {
250            MatchResult::no_match()
251        }
252    }
253
254    /// Generate a human-readable explanation of why two components matched or didn't.
255    ///
256    /// Useful for debugging and auditing match decisions.
257    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
258        let result = self.match_detailed(a, b);
259        if result.is_match() {
260            MatchExplanation::matched(
261                result.tier,
262                result.score,
263                format!("'{}' matches '{}' via {:?}", a.name, b.name, result.tier),
264            )
265        } else {
266            MatchExplanation::no_match(format!(
267                "'{}' does not match '{}' (score {:.2} below threshold)",
268                a.name, b.name, result.score
269            ))
270        }
271    }
272
273    /// Find the best matching component from a list of candidates.
274    ///
275    /// Returns the best match and its score, or None if no match meets the threshold.
276    fn find_best_match<'a>(
277        &self,
278        target: &Component,
279        candidates: &'a [&Component],
280        threshold: f64,
281    ) -> Option<(&'a Component, f64)> {
282        candidates
283            .iter()
284            .map(|c| (*c, self.match_score(target, c)))
285            .filter(|(_, score)| *score >= threshold)
286            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
287    }
288
289    /// Get the name of this matcher for logging/debugging.
290    fn name(&self) -> &'static str {
291        "ComponentMatcher"
292    }
293
294    /// Get the minimum threshold this matcher uses for fuzzy matching.
295    fn threshold(&self) -> f64 {
296        0.0
297    }
298}
299
300/// Configuration for the cached matcher.
301#[derive(Debug, Clone)]
302pub struct CacheConfig {
303    /// Maximum number of entries in the cache.
304    pub max_entries: usize,
305    /// Whether to cache detailed results (more memory).
306    pub cache_detailed: bool,
307}
308
309impl Default for CacheConfig {
310    fn default() -> Self {
311        Self {
312            max_entries: 100_000,
313            cache_detailed: false,
314        }
315    }
316}
317
318impl CacheConfig {
319    /// Create a config optimized for small SBOMs.
320    #[must_use] 
321    pub const fn small() -> Self {
322        Self {
323            max_entries: 10_000,
324            cache_detailed: true,
325        }
326    }
327
328    /// Create a config optimized for large SBOMs.
329    #[must_use] 
330    pub const fn large() -> Self {
331        Self {
332            max_entries: 500_000,
333            cache_detailed: false,
334        }
335    }
336}
337
338/// Cache key combining component IDs.
339#[derive(Hash, Eq, PartialEq, Clone)]
340struct CacheKey {
341    hash: u64,
342}
343
344impl CacheKey {
345    fn new(a_id: &str, b_id: &str) -> Self {
346        use xxhash_rust::xxh3::xxh3_64;
347
348        // Create a combined key - order-independent for symmetry
349        let (first, second) = if a_id < b_id {
350            (a_id, b_id)
351        } else {
352            (b_id, a_id)
353        };
354
355        let combined = format!("{first}|{second}");
356        Self {
357            hash: xxh3_64(combined.as_bytes()),
358        }
359    }
360}
361
362/// Cached match result entry.
363#[derive(Clone)]
364struct CacheEntry {
365    score: f64,
366    detailed: Option<MatchResult>,
367}
368
369/// A wrapper that caches match results for performance.
370///
371/// The cache uses component IDs to generate cache keys and stores
372/// match scores for quick lookup. This is particularly effective when
373/// the same component pairs are compared multiple times.
374///
375/// # Example
376///
377/// ```ignore
378/// use sbom_tools::matching::{CachedMatcher, FuzzyMatcher, FuzzyMatchConfig, CacheConfig};
379///
380/// let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
381/// let cached = CachedMatcher::new(matcher);
382/// // Or with custom config:
383/// let cached = CachedMatcher::with_config(matcher, CacheConfig::large());
384/// ```
385pub struct CachedMatcher<M: ComponentMatcher> {
386    inner: M,
387    config: CacheConfig,
388    cache: std::sync::RwLock<std::collections::HashMap<CacheKey, CacheEntry>>,
389    stats: std::sync::atomic::AtomicUsize,
390    hits: std::sync::atomic::AtomicUsize,
391}
392
393impl<M: ComponentMatcher> CachedMatcher<M> {
394    /// Create a new cached matcher wrapping the given matcher.
395    pub fn new(inner: M) -> Self {
396        Self::with_config(inner, CacheConfig::default())
397    }
398
399    /// Create a cached matcher with custom configuration.
400    pub fn with_config(inner: M, config: CacheConfig) -> Self {
401        Self {
402            inner,
403            config,
404            cache: std::sync::RwLock::new(std::collections::HashMap::new()),
405            stats: std::sync::atomic::AtomicUsize::new(0),
406            hits: std::sync::atomic::AtomicUsize::new(0),
407        }
408    }
409
410    /// Get a reference to the inner matcher.
411    pub const fn inner(&self) -> &M {
412        &self.inner
413    }
414
415    /// Get cache statistics.
416    pub fn cache_stats(&self) -> CacheStats {
417        let total = self.stats.load(std::sync::atomic::Ordering::Relaxed);
418        let hits = self.hits.load(std::sync::atomic::Ordering::Relaxed);
419        let size = self.cache.read().map(|c| c.len()).unwrap_or(0);
420        CacheStats {
421            total_lookups: total,
422            cache_hits: hits,
423            cache_misses: total.saturating_sub(hits),
424            hit_rate: if total > 0 {
425                hits as f64 / total as f64
426            } else {
427                0.0
428            },
429            cache_size: size,
430        }
431    }
432
433    /// Clear the cache.
434    pub fn clear_cache(&self) {
435        if let Ok(mut cache) = self.cache.write() {
436            cache.clear();
437        }
438        self.stats.store(0, std::sync::atomic::Ordering::Relaxed);
439        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
440    }
441
442    /// Try to get a cached score.
443    fn get_cached(&self, key: &CacheKey) -> Option<CacheEntry> {
444        self.stats
445            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
446        if let Ok(cache) = self.cache.read()
447            && let Some(entry) = cache.get(key) {
448                self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
449                return Some(entry.clone());
450            }
451        None
452    }
453
454    /// Store a result in the cache.
455    fn store_cached(&self, key: CacheKey, entry: CacheEntry) {
456        if let Ok(mut cache) = self.cache.write() {
457            // Simple eviction: clear half the cache when full
458            if cache.len() >= self.config.max_entries {
459                let to_remove: Vec<CacheKey> = cache
460                    .keys()
461                    .take(self.config.max_entries / 2)
462                    .cloned()
463                    .collect();
464                for k in to_remove {
465                    cache.remove(&k);
466                }
467            }
468            cache.insert(key, entry);
469        }
470    }
471}
472
473/// Cache statistics.
474#[derive(Debug, Clone)]
475pub struct CacheStats {
476    /// Total number of cache lookups.
477    pub total_lookups: usize,
478    /// Number of cache hits.
479    pub cache_hits: usize,
480    /// Number of cache misses.
481    pub cache_misses: usize,
482    /// Hit rate (0.0 - 1.0).
483    pub hit_rate: f64,
484    /// Current cache size.
485    pub cache_size: usize,
486}
487
488impl<M: ComponentMatcher> ComponentMatcher for CachedMatcher<M> {
489    fn match_score(&self, a: &Component, b: &Component) -> f64 {
490        let key = CacheKey::new(a.canonical_id.value(), b.canonical_id.value());
491
492        // Check cache first
493        if let Some(entry) = self.get_cached(&key) {
494            return entry.score;
495        }
496
497        // Compute and cache
498        let score = self.inner.match_score(a, b);
499        self.store_cached(
500            key,
501            CacheEntry {
502                score,
503                detailed: None,
504            },
505        );
506        score
507    }
508
509    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
510        if !self.config.cache_detailed {
511            return self.inner.match_detailed(a, b);
512        }
513
514        let key = CacheKey::new(a.canonical_id.value(), b.canonical_id.value());
515
516        // Check cache for detailed result
517        if let Some(entry) = self.get_cached(&key)
518            && let Some(detailed) = entry.detailed {
519                return detailed;
520            }
521
522        // Compute and cache
523        let result = self.inner.match_detailed(a, b);
524        self.store_cached(
525            key,
526            CacheEntry {
527                score: result.score,
528                detailed: Some(result.clone()),
529            },
530        );
531        result
532    }
533
534    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
535        // Don't cache explanations as they're typically for debugging
536        self.inner.explain_match(a, b)
537    }
538
539    fn name(&self) -> &'static str {
540        "CachedMatcher"
541    }
542
543    fn threshold(&self) -> f64 {
544        self.inner.threshold()
545    }
546}
547
548/// A composite matcher that tries multiple strategies in order.
549#[must_use]
550pub struct CompositeMatcherBuilder {
551    matchers: Vec<Box<dyn ComponentMatcher>>,
552}
553
554impl CompositeMatcherBuilder {
555    /// Create a new composite matcher builder.
556    pub fn new() -> Self {
557        Self {
558            matchers: Vec::new(),
559        }
560    }
561
562    /// Add a matcher to the composite.
563    pub fn with_matcher(mut self, matcher: Box<dyn ComponentMatcher>) -> Self {
564        self.matchers.push(matcher);
565        self
566    }
567
568    /// Build the composite matcher.
569    #[must_use] 
570    pub fn build(self) -> CompositeMatcher {
571        CompositeMatcher {
572            matchers: self.matchers,
573        }
574    }
575}
576
577impl Default for CompositeMatcherBuilder {
578    fn default() -> Self {
579        Self::new()
580    }
581}
582
583/// A matcher that combines multiple matching strategies.
584pub struct CompositeMatcher {
585    matchers: Vec<Box<dyn ComponentMatcher>>,
586}
587
588impl ComponentMatcher for CompositeMatcher {
589    fn match_score(&self, a: &Component, b: &Component) -> f64 {
590        // Return the highest score from any matcher
591        self.matchers
592            .iter()
593            .map(|m| m.match_score(a, b))
594            .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
595            .unwrap_or(0.0)
596    }
597
598    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
599        // Return the best result from any matcher
600        self.matchers
601            .iter()
602            .map(|m| m.match_detailed(a, b))
603            .max_by(|a, b| {
604                a.score
605                    .partial_cmp(&b.score)
606                    .unwrap_or(std::cmp::Ordering::Equal)
607            })
608            .unwrap_or_else(MatchResult::no_match)
609    }
610
611    fn name(&self) -> &'static str {
612        "CompositeMatcher"
613    }
614}
615
616#[cfg(test)]
617mod tests {
618    use super::*;
619
620    /// A simple test matcher that always returns a fixed score
621    struct FixedScoreMatcher(f64);
622
623    impl ComponentMatcher for FixedScoreMatcher {
624        fn match_score(&self, _a: &Component, _b: &Component) -> f64 {
625            self.0
626        }
627
628        fn name(&self) -> &'static str {
629            "FixedScoreMatcher"
630        }
631    }
632
633    #[test]
634    fn test_match_result_creation() {
635        let result = MatchResult::new(0.95, MatchTier::Alias);
636        assert_eq!(result.score, 0.95);
637        assert_eq!(result.tier, MatchTier::Alias);
638        assert!(result.is_match());
639    }
640
641    #[test]
642    fn test_no_match_result() {
643        let result = MatchResult::no_match();
644        assert_eq!(result.score, 0.0);
645        assert_eq!(result.tier, MatchTier::None);
646        assert!(!result.is_match());
647    }
648
649    #[test]
650    fn test_match_tier_default_scores() {
651        assert_eq!(MatchTier::ExactIdentifier.default_score(), 1.0);
652        assert_eq!(MatchTier::Alias.default_score(), 0.95);
653        assert_eq!(MatchTier::EcosystemRule.default_score(), 0.90);
654        assert_eq!(MatchTier::None.default_score(), 0.0);
655    }
656
657    #[test]
658    fn test_composite_matcher() {
659        let matcher = CompositeMatcherBuilder::new()
660            .with_matcher(Box::new(FixedScoreMatcher(0.5)))
661            .with_matcher(Box::new(FixedScoreMatcher(0.8)))
662            .with_matcher(Box::new(FixedScoreMatcher(0.3)))
663            .build();
664
665        let comp_a = Component::new("test".to_string(), "id-1".to_string());
666        let comp_b = Component::new("test".to_string(), "id-2".to_string());
667
668        // Should return the highest score (0.8)
669        assert_eq!(matcher.match_score(&comp_a, &comp_b), 0.8);
670    }
671
672    #[test]
673    fn test_find_best_match() {
674        let matcher = FixedScoreMatcher(0.85);
675        let target = Component::new("target".to_string(), "id-0".to_string());
676        let candidates: Vec<Component> = vec![
677            Component::new("candidate1".to_string(), "id-1".to_string()),
678            Component::new("candidate2".to_string(), "id-2".to_string()),
679        ];
680        let candidate_refs: Vec<&Component> = candidates.iter().collect();
681
682        // With threshold 0.8, should find a match
683        let result = matcher.find_best_match(&target, &candidate_refs, 0.8);
684        assert!(result.is_some());
685
686        // With threshold 0.9, should not find a match
687        let result = matcher.find_best_match(&target, &candidate_refs, 0.9);
688        assert!(result.is_none());
689    }
690
691    #[test]
692    fn test_match_explanation_matched() {
693        let explanation =
694            MatchExplanation::matched(MatchTier::ExactIdentifier, 1.0, "Test match reason");
695
696        assert!(explanation.is_match);
697        assert_eq!(explanation.score, 1.0);
698        assert_eq!(explanation.tier, MatchTier::ExactIdentifier);
699        assert!(explanation.summary().contains("MATCH"));
700        assert!(explanation.summary().contains("100%"));
701    }
702
703    #[test]
704    fn test_match_explanation_no_match() {
705        let explanation = MatchExplanation::no_match("Components are too different");
706
707        assert!(!explanation.is_match);
708        assert_eq!(explanation.score, 0.0);
709        assert_eq!(explanation.tier, MatchTier::None);
710        assert!(explanation.summary().contains("NO MATCH"));
711    }
712
713    #[test]
714    fn test_match_explanation_with_breakdown() {
715        let explanation = MatchExplanation::matched(MatchTier::Fuzzy, 0.85, "Fuzzy match")
716            .with_score_component(ScoreComponent {
717                name: "Jaro-Winkler".to_string(),
718                weight: 0.7,
719                raw_score: 0.9,
720                weighted_score: 0.63,
721                description: "name similarity".to_string(),
722            })
723            .with_score_component(ScoreComponent {
724                name: "Levenshtein".to_string(),
725                weight: 0.3,
726                raw_score: 0.73,
727                weighted_score: 0.22,
728                description: "edit distance".to_string(),
729            })
730            .with_normalization("lowercase");
731
732        assert_eq!(explanation.score_breakdown.len(), 2);
733        assert_eq!(explanation.normalizations_applied.len(), 1);
734
735        let detailed = explanation.detailed();
736        assert!(detailed.contains("Score breakdown:"));
737        assert!(detailed.contains("Jaro-Winkler"));
738        assert!(detailed.contains("Normalizations: lowercase"));
739    }
740
741    #[test]
742    fn test_match_explanation_display() {
743        let explanation = MatchExplanation::matched(MatchTier::Alias, 0.95, "Known alias");
744        let display = format!("{}", explanation);
745        assert!(display.contains("MATCH"));
746        assert!(display.contains("95%"));
747    }
748}