Skip to main content

sbom_tools/matching/
traits.rs

1//! Trait definitions for component matching strategies.
2//!
3//! This module provides abstractions for component matching, enabling
4//! pluggable matching strategies and easier testing.
5
6use crate::model::Component;
7
8/// Result of matching two components.
9#[derive(Debug, Clone)]
10#[must_use]
11pub struct MatchResult {
12    /// The matching confidence score (0.0 - 1.0)
13    pub score: f64,
14    /// The matching tier that produced this result
15    pub tier: MatchTier,
16    /// Additional metadata about the match
17    pub metadata: MatchMetadata,
18}
19
20impl MatchResult {
21    /// Create a new match result
22    pub fn new(score: f64, tier: MatchTier) -> Self {
23        Self {
24            score,
25            tier,
26            metadata: MatchMetadata::default(),
27        }
28    }
29
30    /// Create a match result with metadata
31    pub const fn with_metadata(score: f64, tier: MatchTier, metadata: MatchMetadata) -> Self {
32        Self {
33            score,
34            tier,
35            metadata,
36        }
37    }
38
39    /// Create a no-match result
40    pub fn no_match() -> Self {
41        Self {
42            score: 0.0,
43            tier: MatchTier::None,
44            metadata: MatchMetadata::default(),
45        }
46    }
47
48    /// Check if this represents a successful match
49    #[must_use]
50    pub fn is_match(&self) -> bool {
51        self.score > 0.0 && self.tier != MatchTier::None
52    }
53}
54
55/// The tier/level at which a match was found.
56#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
57#[non_exhaustive]
58pub enum MatchTier {
59    /// No match found
60    None,
61    /// Exact identifier match (PURL, CPE, etc.)
62    ExactIdentifier,
63    /// Match via alias table
64    Alias,
65    /// Match via ecosystem-specific rules
66    EcosystemRule,
67    /// Match via fuzzy string similarity
68    Fuzzy,
69    /// Match via custom user rules
70    CustomRule,
71}
72
73impl MatchTier {
74    /// Get the default confidence score for this tier
75    #[must_use]
76    pub const fn default_score(&self) -> f64 {
77        match self {
78            Self::None => 0.0,
79            Self::ExactIdentifier => 1.0,
80            Self::Alias => 0.95,
81            Self::EcosystemRule => 0.90,
82            Self::CustomRule => 0.92,
83            Self::Fuzzy => 0.80,
84        }
85    }
86}
87
88/// Additional metadata about a match.
89#[derive(Debug, Clone, Default)]
90pub struct MatchMetadata {
91    /// The field(s) that matched
92    pub matched_fields: Vec<String>,
93    /// The normalization applied, if any
94    pub normalization: Option<String>,
95    /// The rule that produced the match, if applicable
96    pub rule_id: Option<String>,
97}
98
99/// Human-readable explanation of why two components matched (or didn't).
100///
101/// Useful for debugging match decisions and auditing SBOM diff results.
102#[derive(Debug, Clone)]
103pub struct MatchExplanation {
104    /// The matching tier that produced this result
105    pub tier: MatchTier,
106    /// The final confidence score
107    pub score: f64,
108    /// Human-readable reason for the match/non-match
109    pub reason: String,
110    /// Detailed breakdown of score components
111    pub score_breakdown: Vec<ScoreComponent>,
112    /// Normalizations that were applied
113    pub normalizations_applied: Vec<String>,
114    /// Whether this was a successful match
115    pub is_match: bool,
116}
117
118/// A component of the overall match score.
119#[derive(Debug, Clone)]
120pub struct ScoreComponent {
121    /// Name of this score component
122    pub name: String,
123    /// Weight applied to this component
124    pub weight: f64,
125    /// Raw score before weighting
126    pub raw_score: f64,
127    /// Weighted contribution to final score
128    pub weighted_score: f64,
129    /// Description of what was compared
130    pub description: String,
131}
132
133impl MatchExplanation {
134    /// Create an explanation for a successful match.
135    pub fn matched(tier: MatchTier, score: f64, reason: impl Into<String>) -> Self {
136        Self {
137            tier,
138            score,
139            reason: reason.into(),
140            score_breakdown: Vec::new(),
141            normalizations_applied: Vec::new(),
142            is_match: true,
143        }
144    }
145
146    /// Create an explanation for a failed match.
147    pub fn no_match(reason: impl Into<String>) -> Self {
148        Self {
149            tier: MatchTier::None,
150            score: 0.0,
151            reason: reason.into(),
152            score_breakdown: Vec::new(),
153            normalizations_applied: Vec::new(),
154            is_match: false,
155        }
156    }
157
158    /// Add a score component to the breakdown.
159    #[must_use]
160    pub fn with_score_component(mut self, component: ScoreComponent) -> Self {
161        self.score_breakdown.push(component);
162        self
163    }
164
165    /// Add a normalization that was applied.
166    #[must_use]
167    pub fn with_normalization(mut self, normalization: impl Into<String>) -> Self {
168        self.normalizations_applied.push(normalization.into());
169        self
170    }
171
172    /// Generate a human-readable summary of the match.
173    #[must_use]
174    pub fn summary(&self) -> String {
175        if self.is_match {
176            format!(
177                "MATCH ({:.0}% confidence via {:?}): {}",
178                self.score * 100.0,
179                self.tier,
180                self.reason
181            )
182        } else {
183            format!("NO MATCH: {}", self.reason)
184        }
185    }
186
187    /// Generate a detailed multi-line explanation.
188    #[must_use]
189    pub fn detailed(&self) -> String {
190        let mut lines = vec![self.summary()];
191
192        if !self.score_breakdown.is_empty() {
193            lines.push("Score breakdown:".to_string());
194            for component in &self.score_breakdown {
195                lines.push(format!(
196                    "  - {}: {:.2} × {:.2} = {:.2} ({})",
197                    component.name,
198                    component.raw_score,
199                    component.weight,
200                    component.weighted_score,
201                    component.description
202                ));
203            }
204        }
205
206        if !self.normalizations_applied.is_empty() {
207            lines.push(format!(
208                "Normalizations: {}",
209                self.normalizations_applied.join(", ")
210            ));
211        }
212
213        lines.join("\n")
214    }
215}
216
217impl std::fmt::Display for MatchExplanation {
218    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
219        write!(f, "{}", self.summary())
220    }
221}
222
223/// Trait for component matching strategies.
224///
225/// Implementors provide different strategies for determining if two
226/// components represent the same logical package across SBOMs.
227///
228/// # Example
229///
230/// ```ignore
231/// use sbom_tools::matching::{ComponentMatcher, FuzzyMatcher, FuzzyMatchConfig};
232///
233/// let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
234/// let score = matcher.match_score(&component_a, &component_b);
235/// ```
236pub trait ComponentMatcher: Send + Sync {
237    /// Compute a match score between two components.
238    ///
239    /// Returns a score between 0.0 (no match) and 1.0 (perfect match).
240    fn match_score(&self, a: &Component, b: &Component) -> f64;
241
242    /// Compute a detailed match result between two components.
243    ///
244    /// Returns a `MatchResult` with score, tier, and metadata.
245    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
246        let score = self.match_score(a, b);
247        if score > 0.0 {
248            MatchResult::new(score, MatchTier::Fuzzy)
249        } else {
250            MatchResult::no_match()
251        }
252    }
253
254    /// Generate a human-readable explanation of why two components matched or didn't.
255    ///
256    /// Useful for debugging and auditing match decisions.
257    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
258        let result = self.match_detailed(a, b);
259        if result.is_match() {
260            MatchExplanation::matched(
261                result.tier,
262                result.score,
263                format!("'{}' matches '{}' via {:?}", a.name, b.name, result.tier),
264            )
265        } else {
266            MatchExplanation::no_match(format!(
267                "'{}' does not match '{}' (score {:.2} below threshold)",
268                a.name, b.name, result.score
269            ))
270        }
271    }
272
273    /// Find the best matching component from a list of candidates.
274    ///
275    /// Returns the best match and its score, or None if no match meets the threshold.
276    fn find_best_match<'a>(
277        &self,
278        target: &Component,
279        candidates: &'a [&Component],
280        threshold: f64,
281    ) -> Option<(&'a Component, f64)> {
282        candidates
283            .iter()
284            .map(|c| (*c, self.match_score(target, c)))
285            .filter(|(_, score)| *score >= threshold)
286            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
287    }
288
289    /// Get the name of this matcher for logging/debugging.
290    fn name(&self) -> &'static str {
291        "ComponentMatcher"
292    }
293
294    /// Get the minimum threshold this matcher uses for fuzzy matching.
295    fn threshold(&self) -> f64 {
296        0.0
297    }
298}
299
300/// Configuration for the cached matcher.
301#[derive(Debug, Clone)]
302pub struct CacheConfig {
303    /// Maximum number of entries in the cache.
304    pub max_entries: usize,
305    /// Whether to cache detailed results (more memory).
306    pub cache_detailed: bool,
307}
308
309impl Default for CacheConfig {
310    fn default() -> Self {
311        Self {
312            max_entries: 100_000,
313            cache_detailed: false,
314        }
315    }
316}
317
318impl CacheConfig {
319    /// Create a config optimized for small SBOMs.
320    #[must_use]
321    pub const fn small() -> Self {
322        Self {
323            max_entries: 10_000,
324            cache_detailed: true,
325        }
326    }
327
328    /// Create a config optimized for large SBOMs.
329    #[must_use]
330    pub const fn large() -> Self {
331        Self {
332            max_entries: 500_000,
333            cache_detailed: false,
334        }
335    }
336}
337
338/// Cache key combining component IDs.
339#[derive(Hash, Eq, PartialEq, Clone)]
340struct CacheKey {
341    hash: u64,
342}
343
344impl CacheKey {
345    fn new(a_id: &str, b_id: &str) -> Self {
346        use xxhash_rust::xxh3::xxh3_64;
347
348        // Create a combined key - order-independent for symmetry
349        let (first, second) = if a_id < b_id {
350            (a_id, b_id)
351        } else {
352            (b_id, a_id)
353        };
354
355        let combined = format!("{first}|{second}");
356        Self {
357            hash: xxh3_64(combined.as_bytes()),
358        }
359    }
360}
361
362/// Cached match result entry.
363#[derive(Clone)]
364struct CacheEntry {
365    score: f64,
366    detailed: Option<MatchResult>,
367}
368
369/// A wrapper that caches match results for performance.
370///
371/// The cache uses component IDs to generate cache keys and stores
372/// match scores for quick lookup. This is particularly effective when
373/// the same component pairs are compared multiple times.
374///
375/// # Example
376///
377/// ```ignore
378/// use sbom_tools::matching::{CachedMatcher, FuzzyMatcher, FuzzyMatchConfig, CacheConfig};
379///
380/// let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
381/// let cached = CachedMatcher::new(matcher);
382/// // Or with custom config:
383/// let cached = CachedMatcher::with_config(matcher, CacheConfig::large());
384/// ```
385pub struct CachedMatcher<M: ComponentMatcher> {
386    inner: M,
387    config: CacheConfig,
388    cache: std::sync::RwLock<std::collections::HashMap<CacheKey, CacheEntry>>,
389    stats: std::sync::atomic::AtomicUsize,
390    hits: std::sync::atomic::AtomicUsize,
391}
392
393impl<M: ComponentMatcher> CachedMatcher<M> {
394    /// Create a new cached matcher wrapping the given matcher.
395    pub fn new(inner: M) -> Self {
396        Self::with_config(inner, CacheConfig::default())
397    }
398
399    /// Create a cached matcher with custom configuration.
400    pub fn with_config(inner: M, config: CacheConfig) -> Self {
401        Self {
402            inner,
403            config,
404            cache: std::sync::RwLock::new(std::collections::HashMap::new()),
405            stats: std::sync::atomic::AtomicUsize::new(0),
406            hits: std::sync::atomic::AtomicUsize::new(0),
407        }
408    }
409
410    /// Get a reference to the inner matcher.
411    pub const fn inner(&self) -> &M {
412        &self.inner
413    }
414
415    /// Get cache statistics.
416    pub fn cache_stats(&self) -> CacheStats {
417        let total = self.stats.load(std::sync::atomic::Ordering::Relaxed);
418        let hits = self.hits.load(std::sync::atomic::Ordering::Relaxed);
419        let size = self.cache.read().map(|c| c.len()).unwrap_or(0);
420        CacheStats {
421            total_lookups: total,
422            cache_hits: hits,
423            cache_misses: total.saturating_sub(hits),
424            hit_rate: if total > 0 {
425                hits as f64 / total as f64
426            } else {
427                0.0
428            },
429            cache_size: size,
430        }
431    }
432
433    /// Clear the cache.
434    pub fn clear_cache(&self) {
435        if let Ok(mut cache) = self.cache.write() {
436            cache.clear();
437        }
438        self.stats.store(0, std::sync::atomic::Ordering::Relaxed);
439        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
440    }
441
442    /// Try to get a cached score.
443    fn get_cached(&self, key: &CacheKey) -> Option<CacheEntry> {
444        self.stats
445            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
446        if let Ok(cache) = self.cache.read()
447            && let Some(entry) = cache.get(key)
448        {
449            self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
450            return Some(entry.clone());
451        }
452        None
453    }
454
455    /// Store a result in the cache.
456    fn store_cached(&self, key: CacheKey, entry: CacheEntry) {
457        if let Ok(mut cache) = self.cache.write() {
458            // Simple eviction: clear half the cache when full
459            if cache.len() >= self.config.max_entries {
460                let to_remove: Vec<CacheKey> = cache
461                    .keys()
462                    .take(self.config.max_entries / 2)
463                    .cloned()
464                    .collect();
465                for k in to_remove {
466                    cache.remove(&k);
467                }
468            }
469            cache.insert(key, entry);
470        }
471    }
472}
473
474/// Cache statistics.
475#[derive(Debug, Clone)]
476pub struct CacheStats {
477    /// Total number of cache lookups.
478    pub total_lookups: usize,
479    /// Number of cache hits.
480    pub cache_hits: usize,
481    /// Number of cache misses.
482    pub cache_misses: usize,
483    /// Hit rate (0.0 - 1.0).
484    pub hit_rate: f64,
485    /// Current cache size.
486    pub cache_size: usize,
487}
488
489impl<M: ComponentMatcher> ComponentMatcher for CachedMatcher<M> {
490    fn match_score(&self, a: &Component, b: &Component) -> f64 {
491        let key = CacheKey::new(a.canonical_id.value(), b.canonical_id.value());
492
493        // Check cache first
494        if let Some(entry) = self.get_cached(&key) {
495            return entry.score;
496        }
497
498        // Compute and cache
499        let score = self.inner.match_score(a, b);
500        self.store_cached(
501            key,
502            CacheEntry {
503                score,
504                detailed: None,
505            },
506        );
507        score
508    }
509
510    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
511        if !self.config.cache_detailed {
512            return self.inner.match_detailed(a, b);
513        }
514
515        let key = CacheKey::new(a.canonical_id.value(), b.canonical_id.value());
516
517        // Check cache for detailed result
518        if let Some(entry) = self.get_cached(&key)
519            && let Some(detailed) = entry.detailed
520        {
521            return detailed;
522        }
523
524        // Compute and cache
525        let result = self.inner.match_detailed(a, b);
526        self.store_cached(
527            key,
528            CacheEntry {
529                score: result.score,
530                detailed: Some(result.clone()),
531            },
532        );
533        result
534    }
535
536    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
537        // Don't cache explanations as they're typically for debugging
538        self.inner.explain_match(a, b)
539    }
540
541    fn name(&self) -> &'static str {
542        "CachedMatcher"
543    }
544
545    fn threshold(&self) -> f64 {
546        self.inner.threshold()
547    }
548}
549
550/// A composite matcher that tries multiple strategies in order.
551#[must_use]
552pub struct CompositeMatcherBuilder {
553    matchers: Vec<Box<dyn ComponentMatcher>>,
554}
555
556impl CompositeMatcherBuilder {
557    /// Create a new composite matcher builder.
558    pub fn new() -> Self {
559        Self {
560            matchers: Vec::new(),
561        }
562    }
563
564    /// Add a matcher to the composite.
565    pub fn with_matcher(mut self, matcher: Box<dyn ComponentMatcher>) -> Self {
566        self.matchers.push(matcher);
567        self
568    }
569
570    /// Build the composite matcher.
571    #[must_use]
572    pub fn build(self) -> CompositeMatcher {
573        CompositeMatcher {
574            matchers: self.matchers,
575        }
576    }
577}
578
579impl Default for CompositeMatcherBuilder {
580    fn default() -> Self {
581        Self::new()
582    }
583}
584
585/// A matcher that combines multiple matching strategies.
586pub struct CompositeMatcher {
587    matchers: Vec<Box<dyn ComponentMatcher>>,
588}
589
590impl ComponentMatcher for CompositeMatcher {
591    fn match_score(&self, a: &Component, b: &Component) -> f64 {
592        // Return the highest score from any matcher
593        self.matchers
594            .iter()
595            .map(|m| m.match_score(a, b))
596            .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
597            .unwrap_or(0.0)
598    }
599
600    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
601        // Return the best result from any matcher
602        self.matchers
603            .iter()
604            .map(|m| m.match_detailed(a, b))
605            .max_by(|a, b| {
606                a.score
607                    .partial_cmp(&b.score)
608                    .unwrap_or(std::cmp::Ordering::Equal)
609            })
610            .unwrap_or_else(MatchResult::no_match)
611    }
612
613    fn name(&self) -> &'static str {
614        "CompositeMatcher"
615    }
616}
617
618#[cfg(test)]
619mod tests {
620    use super::*;
621
622    /// A simple test matcher that always returns a fixed score
623    struct FixedScoreMatcher(f64);
624
625    impl ComponentMatcher for FixedScoreMatcher {
626        fn match_score(&self, _a: &Component, _b: &Component) -> f64 {
627            self.0
628        }
629
630        fn name(&self) -> &'static str {
631            "FixedScoreMatcher"
632        }
633    }
634
635    #[test]
636    fn test_match_result_creation() {
637        let result = MatchResult::new(0.95, MatchTier::Alias);
638        assert_eq!(result.score, 0.95);
639        assert_eq!(result.tier, MatchTier::Alias);
640        assert!(result.is_match());
641    }
642
643    #[test]
644    fn test_no_match_result() {
645        let result = MatchResult::no_match();
646        assert_eq!(result.score, 0.0);
647        assert_eq!(result.tier, MatchTier::None);
648        assert!(!result.is_match());
649    }
650
651    #[test]
652    fn test_match_tier_default_scores() {
653        assert_eq!(MatchTier::ExactIdentifier.default_score(), 1.0);
654        assert_eq!(MatchTier::Alias.default_score(), 0.95);
655        assert_eq!(MatchTier::EcosystemRule.default_score(), 0.90);
656        assert_eq!(MatchTier::None.default_score(), 0.0);
657    }
658
659    #[test]
660    fn test_composite_matcher() {
661        let matcher = CompositeMatcherBuilder::new()
662            .with_matcher(Box::new(FixedScoreMatcher(0.5)))
663            .with_matcher(Box::new(FixedScoreMatcher(0.8)))
664            .with_matcher(Box::new(FixedScoreMatcher(0.3)))
665            .build();
666
667        let comp_a = Component::new("test".to_string(), "id-1".to_string());
668        let comp_b = Component::new("test".to_string(), "id-2".to_string());
669
670        // Should return the highest score (0.8)
671        assert_eq!(matcher.match_score(&comp_a, &comp_b), 0.8);
672    }
673
674    #[test]
675    fn test_find_best_match() {
676        let matcher = FixedScoreMatcher(0.85);
677        let target = Component::new("target".to_string(), "id-0".to_string());
678        let candidates: Vec<Component> = vec![
679            Component::new("candidate1".to_string(), "id-1".to_string()),
680            Component::new("candidate2".to_string(), "id-2".to_string()),
681        ];
682        let candidate_refs: Vec<&Component> = candidates.iter().collect();
683
684        // With threshold 0.8, should find a match
685        let result = matcher.find_best_match(&target, &candidate_refs, 0.8);
686        assert!(result.is_some());
687
688        // With threshold 0.9, should not find a match
689        let result = matcher.find_best_match(&target, &candidate_refs, 0.9);
690        assert!(result.is_none());
691    }
692
693    #[test]
694    fn test_match_explanation_matched() {
695        let explanation =
696            MatchExplanation::matched(MatchTier::ExactIdentifier, 1.0, "Test match reason");
697
698        assert!(explanation.is_match);
699        assert_eq!(explanation.score, 1.0);
700        assert_eq!(explanation.tier, MatchTier::ExactIdentifier);
701        assert!(explanation.summary().contains("MATCH"));
702        assert!(explanation.summary().contains("100%"));
703    }
704
705    #[test]
706    fn test_match_explanation_no_match() {
707        let explanation = MatchExplanation::no_match("Components are too different");
708
709        assert!(!explanation.is_match);
710        assert_eq!(explanation.score, 0.0);
711        assert_eq!(explanation.tier, MatchTier::None);
712        assert!(explanation.summary().contains("NO MATCH"));
713    }
714
715    #[test]
716    fn test_match_explanation_with_breakdown() {
717        let explanation = MatchExplanation::matched(MatchTier::Fuzzy, 0.85, "Fuzzy match")
718            .with_score_component(ScoreComponent {
719                name: "Jaro-Winkler".to_string(),
720                weight: 0.7,
721                raw_score: 0.9,
722                weighted_score: 0.63,
723                description: "name similarity".to_string(),
724            })
725            .with_score_component(ScoreComponent {
726                name: "Levenshtein".to_string(),
727                weight: 0.3,
728                raw_score: 0.73,
729                weighted_score: 0.22,
730                description: "edit distance".to_string(),
731            })
732            .with_normalization("lowercase");
733
734        assert_eq!(explanation.score_breakdown.len(), 2);
735        assert_eq!(explanation.normalizations_applied.len(), 1);
736
737        let detailed = explanation.detailed();
738        assert!(detailed.contains("Score breakdown:"));
739        assert!(detailed.contains("Jaro-Winkler"));
740        assert!(detailed.contains("Normalizations: lowercase"));
741    }
742
743    #[test]
744    fn test_match_explanation_display() {
745        let explanation = MatchExplanation::matched(MatchTier::Alias, 0.95, "Known alias");
746        let display = format!("{}", explanation);
747        assert!(display.contains("MATCH"));
748        assert!(display.contains("95%"));
749    }
750}