Skip to main content

sbom_tools/matching/
traits.rs

1//! Trait definitions for component matching strategies.
2//!
3//! This module provides abstractions for component matching, enabling
4//! pluggable matching strategies and easier testing.
5
6use crate::model::Component;
7
8/// Result of matching two components.
9#[derive(Debug, Clone)]
10pub struct MatchResult {
11    /// The matching confidence score (0.0 - 1.0)
12    pub score: f64,
13    /// The matching tier that produced this result
14    pub tier: MatchTier,
15    /// Additional metadata about the match
16    pub metadata: MatchMetadata,
17}
18
19impl MatchResult {
20    /// Create a new match result
21    pub fn new(score: f64, tier: MatchTier) -> Self {
22        Self {
23            score,
24            tier,
25            metadata: MatchMetadata::default(),
26        }
27    }
28
29    /// Create a match result with metadata
30    pub fn with_metadata(score: f64, tier: MatchTier, metadata: MatchMetadata) -> Self {
31        Self {
32            score,
33            tier,
34            metadata,
35        }
36    }
37
38    /// Create a no-match result
39    pub fn no_match() -> Self {
40        Self {
41            score: 0.0,
42            tier: MatchTier::None,
43            metadata: MatchMetadata::default(),
44        }
45    }
46
47    /// Check if this represents a successful match
48    pub fn is_match(&self) -> bool {
49        self.score > 0.0 && self.tier != MatchTier::None
50    }
51}
52
53/// The tier/level at which a match was found.
54#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
55pub enum MatchTier {
56    /// No match found
57    None,
58    /// Exact identifier match (PURL, CPE, etc.)
59    ExactIdentifier,
60    /// Match via alias table
61    Alias,
62    /// Match via ecosystem-specific rules
63    EcosystemRule,
64    /// Match via fuzzy string similarity
65    Fuzzy,
66    /// Match via custom user rules
67    CustomRule,
68    /// Match via ML embedding similarity
69    #[cfg(feature = "ml-matching")]
70    Embedding,
71}
72
73impl MatchTier {
74    /// Get the default confidence score for this tier
75    pub fn default_score(&self) -> f64 {
76        match self {
77            MatchTier::None => 0.0,
78            MatchTier::ExactIdentifier => 1.0,
79            MatchTier::Alias => 0.95,
80            MatchTier::EcosystemRule => 0.90,
81            MatchTier::CustomRule => 0.92,
82            MatchTier::Fuzzy => 0.80,
83            #[cfg(feature = "ml-matching")]
84            MatchTier::Embedding => 0.85,
85        }
86    }
87}
88
89/// Additional metadata about a match.
90#[derive(Debug, Clone, Default)]
91pub struct MatchMetadata {
92    /// The field(s) that matched
93    pub matched_fields: Vec<String>,
94    /// The normalization applied, if any
95    pub normalization: Option<String>,
96    /// The rule that produced the match, if applicable
97    pub rule_id: Option<String>,
98}
99
100/// Human-readable explanation of why two components matched (or didn't).
101///
102/// Useful for debugging match decisions and auditing SBOM diff results.
103#[derive(Debug, Clone)]
104pub struct MatchExplanation {
105    /// The matching tier that produced this result
106    pub tier: MatchTier,
107    /// The final confidence score
108    pub score: f64,
109    /// Human-readable reason for the match/non-match
110    pub reason: String,
111    /// Detailed breakdown of score components
112    pub score_breakdown: Vec<ScoreComponent>,
113    /// Normalizations that were applied
114    pub normalizations_applied: Vec<String>,
115    /// Whether this was a successful match
116    pub is_match: bool,
117}
118
119/// A component of the overall match score.
120#[derive(Debug, Clone)]
121pub struct ScoreComponent {
122    /// Name of this score component
123    pub name: String,
124    /// Weight applied to this component
125    pub weight: f64,
126    /// Raw score before weighting
127    pub raw_score: f64,
128    /// Weighted contribution to final score
129    pub weighted_score: f64,
130    /// Description of what was compared
131    pub description: String,
132}
133
134impl MatchExplanation {
135    /// Create an explanation for a successful match.
136    pub fn matched(tier: MatchTier, score: f64, reason: impl Into<String>) -> Self {
137        Self {
138            tier,
139            score,
140            reason: reason.into(),
141            score_breakdown: Vec::new(),
142            normalizations_applied: Vec::new(),
143            is_match: true,
144        }
145    }
146
147    /// Create an explanation for a failed match.
148    pub fn no_match(reason: impl Into<String>) -> Self {
149        Self {
150            tier: MatchTier::None,
151            score: 0.0,
152            reason: reason.into(),
153            score_breakdown: Vec::new(),
154            normalizations_applied: Vec::new(),
155            is_match: false,
156        }
157    }
158
159    /// Add a score component to the breakdown.
160    pub fn with_score_component(mut self, component: ScoreComponent) -> Self {
161        self.score_breakdown.push(component);
162        self
163    }
164
165    /// Add a normalization that was applied.
166    pub fn with_normalization(mut self, normalization: impl Into<String>) -> Self {
167        self.normalizations_applied.push(normalization.into());
168        self
169    }
170
171    /// Generate a human-readable summary of the match.
172    pub fn summary(&self) -> String {
173        if self.is_match {
174            format!(
175                "MATCH ({:.0}% confidence via {:?}): {}",
176                self.score * 100.0,
177                self.tier,
178                self.reason
179            )
180        } else {
181            format!("NO MATCH: {}", self.reason)
182        }
183    }
184
185    /// Generate a detailed multi-line explanation.
186    pub fn detailed(&self) -> String {
187        let mut lines = vec![self.summary()];
188
189        if !self.score_breakdown.is_empty() {
190            lines.push("Score breakdown:".to_string());
191            for component in &self.score_breakdown {
192                lines.push(format!(
193                    "  - {}: {:.2} × {:.2} = {:.2} ({})",
194                    component.name,
195                    component.raw_score,
196                    component.weight,
197                    component.weighted_score,
198                    component.description
199                ));
200            }
201        }
202
203        if !self.normalizations_applied.is_empty() {
204            lines.push(format!(
205                "Normalizations: {}",
206                self.normalizations_applied.join(", ")
207            ));
208        }
209
210        lines.join("\n")
211    }
212}
213
214impl std::fmt::Display for MatchExplanation {
215    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
216        write!(f, "{}", self.summary())
217    }
218}
219
220/// Trait for component matching strategies.
221///
222/// Implementors provide different strategies for determining if two
223/// components represent the same logical package across SBOMs.
224///
225/// # Example
226///
227/// ```ignore
228/// use sbom_tools::matching::{ComponentMatcher, FuzzyMatcher, FuzzyMatchConfig};
229///
230/// let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
231/// let score = matcher.match_score(&component_a, &component_b);
232/// ```
233pub trait ComponentMatcher: Send + Sync {
234    /// Compute a match score between two components.
235    ///
236    /// Returns a score between 0.0 (no match) and 1.0 (perfect match).
237    fn match_score(&self, a: &Component, b: &Component) -> f64;
238
239    /// Compute a detailed match result between two components.
240    ///
241    /// Returns a `MatchResult` with score, tier, and metadata.
242    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
243        let score = self.match_score(a, b);
244        if score > 0.0 {
245            MatchResult::new(score, MatchTier::Fuzzy)
246        } else {
247            MatchResult::no_match()
248        }
249    }
250
251    /// Generate a human-readable explanation of why two components matched or didn't.
252    ///
253    /// Useful for debugging and auditing match decisions.
254    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
255        let result = self.match_detailed(a, b);
256        if result.is_match() {
257            MatchExplanation::matched(
258                result.tier,
259                result.score,
260                format!("'{}' matches '{}' via {:?}", a.name, b.name, result.tier),
261            )
262        } else {
263            MatchExplanation::no_match(format!(
264                "'{}' does not match '{}' (score {:.2} below threshold)",
265                a.name, b.name, result.score
266            ))
267        }
268    }
269
270    /// Find the best matching component from a list of candidates.
271    ///
272    /// Returns the best match and its score, or None if no match meets the threshold.
273    fn find_best_match<'a>(
274        &self,
275        target: &Component,
276        candidates: &'a [&Component],
277        threshold: f64,
278    ) -> Option<(&'a Component, f64)> {
279        candidates
280            .iter()
281            .map(|c| (*c, self.match_score(target, c)))
282            .filter(|(_, score)| *score >= threshold)
283            .max_by(|(_, a), (_, b)| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
284    }
285
286    /// Get the name of this matcher for logging/debugging.
287    fn name(&self) -> &str {
288        "ComponentMatcher"
289    }
290
291    /// Get the minimum threshold this matcher uses for fuzzy matching.
292    fn threshold(&self) -> f64 {
293        0.0
294    }
295}
296
297/// Configuration for the cached matcher.
298#[derive(Debug, Clone)]
299pub struct CacheConfig {
300    /// Maximum number of entries in the cache.
301    pub max_entries: usize,
302    /// Whether to cache detailed results (more memory).
303    pub cache_detailed: bool,
304}
305
306impl Default for CacheConfig {
307    fn default() -> Self {
308        Self {
309            max_entries: 100_000,
310            cache_detailed: false,
311        }
312    }
313}
314
315impl CacheConfig {
316    /// Create a config optimized for small SBOMs.
317    pub fn small() -> Self {
318        Self {
319            max_entries: 10_000,
320            cache_detailed: true,
321        }
322    }
323
324    /// Create a config optimized for large SBOMs.
325    pub fn large() -> Self {
326        Self {
327            max_entries: 500_000,
328            cache_detailed: false,
329        }
330    }
331}
332
333/// Cache key combining component IDs.
334#[derive(Hash, Eq, PartialEq, Clone)]
335struct CacheKey {
336    hash: u64,
337}
338
339impl CacheKey {
340    fn new(a_id: &str, b_id: &str) -> Self {
341        use xxhash_rust::xxh3::xxh3_64;
342
343        // Create a combined key - order-independent for symmetry
344        let (first, second) = if a_id < b_id {
345            (a_id, b_id)
346        } else {
347            (b_id, a_id)
348        };
349
350        let combined = format!("{}|{}", first, second);
351        Self {
352            hash: xxh3_64(combined.as_bytes()),
353        }
354    }
355}
356
357/// Cached match result entry.
358#[derive(Clone)]
359struct CacheEntry {
360    score: f64,
361    detailed: Option<MatchResult>,
362}
363
364/// A wrapper that caches match results for performance.
365///
366/// The cache uses component IDs to generate cache keys and stores
367/// match scores for quick lookup. This is particularly effective when
368/// the same component pairs are compared multiple times.
369///
370/// # Example
371///
372/// ```ignore
373/// use sbom_tools::matching::{CachedMatcher, FuzzyMatcher, FuzzyMatchConfig, CacheConfig};
374///
375/// let matcher = FuzzyMatcher::new(FuzzyMatchConfig::balanced());
376/// let cached = CachedMatcher::new(matcher);
377/// // Or with custom config:
378/// let cached = CachedMatcher::with_config(matcher, CacheConfig::large());
379/// ```
380pub struct CachedMatcher<M: ComponentMatcher> {
381    inner: M,
382    config: CacheConfig,
383    cache: std::sync::RwLock<std::collections::HashMap<CacheKey, CacheEntry>>,
384    stats: std::sync::atomic::AtomicUsize,
385    hits: std::sync::atomic::AtomicUsize,
386}
387
388impl<M: ComponentMatcher> CachedMatcher<M> {
389    /// Create a new cached matcher wrapping the given matcher.
390    pub fn new(inner: M) -> Self {
391        Self::with_config(inner, CacheConfig::default())
392    }
393
394    /// Create a cached matcher with custom configuration.
395    pub fn with_config(inner: M, config: CacheConfig) -> Self {
396        Self {
397            inner,
398            config,
399            cache: std::sync::RwLock::new(std::collections::HashMap::new()),
400            stats: std::sync::atomic::AtomicUsize::new(0),
401            hits: std::sync::atomic::AtomicUsize::new(0),
402        }
403    }
404
405    /// Get a reference to the inner matcher.
406    pub fn inner(&self) -> &M {
407        &self.inner
408    }
409
410    /// Get cache statistics.
411    pub fn cache_stats(&self) -> CacheStats {
412        let total = self.stats.load(std::sync::atomic::Ordering::Relaxed);
413        let hits = self.hits.load(std::sync::atomic::Ordering::Relaxed);
414        let size = self.cache.read().map(|c| c.len()).unwrap_or(0);
415        CacheStats {
416            total_lookups: total,
417            cache_hits: hits,
418            cache_misses: total.saturating_sub(hits),
419            hit_rate: if total > 0 {
420                hits as f64 / total as f64
421            } else {
422                0.0
423            },
424            cache_size: size,
425        }
426    }
427
428    /// Clear the cache.
429    pub fn clear_cache(&self) {
430        if let Ok(mut cache) = self.cache.write() {
431            cache.clear();
432        }
433        self.stats.store(0, std::sync::atomic::Ordering::Relaxed);
434        self.hits.store(0, std::sync::atomic::Ordering::Relaxed);
435    }
436
437    /// Try to get a cached score.
438    fn get_cached(&self, key: &CacheKey) -> Option<CacheEntry> {
439        self.stats
440            .fetch_add(1, std::sync::atomic::Ordering::Relaxed);
441        if let Ok(cache) = self.cache.read() {
442            if let Some(entry) = cache.get(key) {
443                self.hits.fetch_add(1, std::sync::atomic::Ordering::Relaxed);
444                return Some(entry.clone());
445            }
446        }
447        None
448    }
449
450    /// Store a result in the cache.
451    fn store_cached(&self, key: CacheKey, entry: CacheEntry) {
452        if let Ok(mut cache) = self.cache.write() {
453            // Simple eviction: clear half the cache when full
454            if cache.len() >= self.config.max_entries {
455                let to_remove: Vec<CacheKey> = cache
456                    .keys()
457                    .take(self.config.max_entries / 2)
458                    .cloned()
459                    .collect();
460                for k in to_remove {
461                    cache.remove(&k);
462                }
463            }
464            cache.insert(key, entry);
465        }
466    }
467}
468
469/// Cache statistics.
470#[derive(Debug, Clone)]
471pub struct CacheStats {
472    /// Total number of cache lookups.
473    pub total_lookups: usize,
474    /// Number of cache hits.
475    pub cache_hits: usize,
476    /// Number of cache misses.
477    pub cache_misses: usize,
478    /// Hit rate (0.0 - 1.0).
479    pub hit_rate: f64,
480    /// Current cache size.
481    pub cache_size: usize,
482}
483
484impl<M: ComponentMatcher> ComponentMatcher for CachedMatcher<M> {
485    fn match_score(&self, a: &Component, b: &Component) -> f64 {
486        let key = CacheKey::new(a.canonical_id.value(), b.canonical_id.value());
487
488        // Check cache first
489        if let Some(entry) = self.get_cached(&key) {
490            return entry.score;
491        }
492
493        // Compute and cache
494        let score = self.inner.match_score(a, b);
495        self.store_cached(
496            key,
497            CacheEntry {
498                score,
499                detailed: None,
500            },
501        );
502        score
503    }
504
505    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
506        if !self.config.cache_detailed {
507            return self.inner.match_detailed(a, b);
508        }
509
510        let key = CacheKey::new(a.canonical_id.value(), b.canonical_id.value());
511
512        // Check cache for detailed result
513        if let Some(entry) = self.get_cached(&key) {
514            if let Some(detailed) = entry.detailed {
515                return detailed;
516            }
517        }
518
519        // Compute and cache
520        let result = self.inner.match_detailed(a, b);
521        self.store_cached(
522            key,
523            CacheEntry {
524                score: result.score,
525                detailed: Some(result.clone()),
526            },
527        );
528        result
529    }
530
531    fn explain_match(&self, a: &Component, b: &Component) -> MatchExplanation {
532        // Don't cache explanations as they're typically for debugging
533        self.inner.explain_match(a, b)
534    }
535
536    fn name(&self) -> &str {
537        "CachedMatcher"
538    }
539
540    fn threshold(&self) -> f64 {
541        self.inner.threshold()
542    }
543}
544
545/// A composite matcher that tries multiple strategies in order.
546pub struct CompositeMatcherBuilder {
547    matchers: Vec<Box<dyn ComponentMatcher>>,
548}
549
550impl CompositeMatcherBuilder {
551    /// Create a new composite matcher builder.
552    pub fn new() -> Self {
553        Self {
554            matchers: Vec::new(),
555        }
556    }
557
558    /// Add a matcher to the composite.
559    pub fn with_matcher(mut self, matcher: Box<dyn ComponentMatcher>) -> Self {
560        self.matchers.push(matcher);
561        self
562    }
563
564    /// Build the composite matcher.
565    pub fn build(self) -> CompositeMatcher {
566        CompositeMatcher {
567            matchers: self.matchers,
568        }
569    }
570}
571
572impl Default for CompositeMatcherBuilder {
573    fn default() -> Self {
574        Self::new()
575    }
576}
577
578/// A matcher that combines multiple matching strategies.
579pub struct CompositeMatcher {
580    matchers: Vec<Box<dyn ComponentMatcher>>,
581}
582
583impl ComponentMatcher for CompositeMatcher {
584    fn match_score(&self, a: &Component, b: &Component) -> f64 {
585        // Return the highest score from any matcher
586        self.matchers
587            .iter()
588            .map(|m| m.match_score(a, b))
589            .max_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal))
590            .unwrap_or(0.0)
591    }
592
593    fn match_detailed(&self, a: &Component, b: &Component) -> MatchResult {
594        // Return the best result from any matcher
595        self.matchers
596            .iter()
597            .map(|m| m.match_detailed(a, b))
598            .max_by(|a, b| {
599                a.score
600                    .partial_cmp(&b.score)
601                    .unwrap_or(std::cmp::Ordering::Equal)
602            })
603            .unwrap_or_else(MatchResult::no_match)
604    }
605
606    fn name(&self) -> &str {
607        "CompositeMatcher"
608    }
609}
610
611#[cfg(test)]
612mod tests {
613    use super::*;
614
615    /// A simple test matcher that always returns a fixed score
616    struct FixedScoreMatcher(f64);
617
618    impl ComponentMatcher for FixedScoreMatcher {
619        fn match_score(&self, _a: &Component, _b: &Component) -> f64 {
620            self.0
621        }
622
623        fn name(&self) -> &str {
624            "FixedScoreMatcher"
625        }
626    }
627
628    #[test]
629    fn test_match_result_creation() {
630        let result = MatchResult::new(0.95, MatchTier::Alias);
631        assert_eq!(result.score, 0.95);
632        assert_eq!(result.tier, MatchTier::Alias);
633        assert!(result.is_match());
634    }
635
636    #[test]
637    fn test_no_match_result() {
638        let result = MatchResult::no_match();
639        assert_eq!(result.score, 0.0);
640        assert_eq!(result.tier, MatchTier::None);
641        assert!(!result.is_match());
642    }
643
644    #[test]
645    fn test_match_tier_default_scores() {
646        assert_eq!(MatchTier::ExactIdentifier.default_score(), 1.0);
647        assert_eq!(MatchTier::Alias.default_score(), 0.95);
648        assert_eq!(MatchTier::EcosystemRule.default_score(), 0.90);
649        assert_eq!(MatchTier::None.default_score(), 0.0);
650    }
651
652    #[test]
653    fn test_composite_matcher() {
654        let matcher = CompositeMatcherBuilder::new()
655            .with_matcher(Box::new(FixedScoreMatcher(0.5)))
656            .with_matcher(Box::new(FixedScoreMatcher(0.8)))
657            .with_matcher(Box::new(FixedScoreMatcher(0.3)))
658            .build();
659
660        let comp_a = Component::new("test".to_string(), "id-1".to_string());
661        let comp_b = Component::new("test".to_string(), "id-2".to_string());
662
663        // Should return the highest score (0.8)
664        assert_eq!(matcher.match_score(&comp_a, &comp_b), 0.8);
665    }
666
667    #[test]
668    fn test_find_best_match() {
669        let matcher = FixedScoreMatcher(0.85);
670        let target = Component::new("target".to_string(), "id-0".to_string());
671        let candidates: Vec<Component> = vec![
672            Component::new("candidate1".to_string(), "id-1".to_string()),
673            Component::new("candidate2".to_string(), "id-2".to_string()),
674        ];
675        let candidate_refs: Vec<&Component> = candidates.iter().collect();
676
677        // With threshold 0.8, should find a match
678        let result = matcher.find_best_match(&target, &candidate_refs, 0.8);
679        assert!(result.is_some());
680
681        // With threshold 0.9, should not find a match
682        let result = matcher.find_best_match(&target, &candidate_refs, 0.9);
683        assert!(result.is_none());
684    }
685
686    #[test]
687    fn test_match_explanation_matched() {
688        let explanation =
689            MatchExplanation::matched(MatchTier::ExactIdentifier, 1.0, "Test match reason");
690
691        assert!(explanation.is_match);
692        assert_eq!(explanation.score, 1.0);
693        assert_eq!(explanation.tier, MatchTier::ExactIdentifier);
694        assert!(explanation.summary().contains("MATCH"));
695        assert!(explanation.summary().contains("100%"));
696    }
697
698    #[test]
699    fn test_match_explanation_no_match() {
700        let explanation = MatchExplanation::no_match("Components are too different");
701
702        assert!(!explanation.is_match);
703        assert_eq!(explanation.score, 0.0);
704        assert_eq!(explanation.tier, MatchTier::None);
705        assert!(explanation.summary().contains("NO MATCH"));
706    }
707
708    #[test]
709    fn test_match_explanation_with_breakdown() {
710        let explanation = MatchExplanation::matched(MatchTier::Fuzzy, 0.85, "Fuzzy match")
711            .with_score_component(ScoreComponent {
712                name: "Jaro-Winkler".to_string(),
713                weight: 0.7,
714                raw_score: 0.9,
715                weighted_score: 0.63,
716                description: "name similarity".to_string(),
717            })
718            .with_score_component(ScoreComponent {
719                name: "Levenshtein".to_string(),
720                weight: 0.3,
721                raw_score: 0.73,
722                weighted_score: 0.22,
723                description: "edit distance".to_string(),
724            })
725            .with_normalization("lowercase");
726
727        assert_eq!(explanation.score_breakdown.len(), 2);
728        assert_eq!(explanation.normalizations_applied.len(), 1);
729
730        let detailed = explanation.detailed();
731        assert!(detailed.contains("Score breakdown:"));
732        assert!(detailed.contains("Jaro-Winkler"));
733        assert!(detailed.contains("Normalizations: lowercase"));
734    }
735
736    #[test]
737    fn test_match_explanation_display() {
738        let explanation = MatchExplanation::matched(MatchTier::Alias, 0.95, "Known alias");
739        let display = format!("{}", explanation);
740        assert!(display.contains("MATCH"));
741        assert!(display.contains("95%"));
742    }
743}